summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/Kconfig9
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/afs/cmservice.c5
-rw-r--r--fs/afs/dir.c11
-rw-r--r--fs/afs/dir_silly.c3
-rw-r--r--fs/afs/fs_operation.c6
-rw-r--r--fs/afs/fsclient.c4
-rw-r--r--fs/afs/inode.c6
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c5
-rw-r--r--fs/autofs/autofs_i.h1
-rw-r--r--fs/autofs/expire.c2
-rw-r--r--fs/autofs/waitq.c72
-rw-r--r--fs/befs/TODO14
-rw-r--r--fs/binfmt_flat.c18
-rw-r--r--fs/block_dev.c40
-rw-r--r--fs/btrfs/compression.c54
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/extent-tree.c8
-rw-r--r--fs/btrfs/extent_io.c31
-rw-r--r--fs/btrfs/file-item.c108
-rw-r--r--fs/btrfs/file.c35
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c59
-rw-r--r--fs/btrfs/ioctl.c4
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/qgroup.c16
-rw-r--r--fs/btrfs/reflink.c39
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/tree-log.c44
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/btrfs/zoned.c9
-rw-r--r--fs/btrfs/zoned.h5
-rw-r--r--fs/btrfs/zstd.c5
-rw-r--r--fs/buffer.c42
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c626
-rw-r--r--fs/ceph/cache.c125
-rw-r--r--fs/ceph/cache.h101
-rw-r--r--fs/ceph/caps.c27
-rw-r--r--fs/ceph/debugfs.c12
-rw-r--r--fs/ceph/dir.c34
-rw-r--r--fs/ceph/export.c12
-rw-r--r--fs/ceph/file.c52
-rw-r--r--fs/ceph/inode.c36
-rw-r--r--fs/ceph/io.c2
-rw-r--r--fs/ceph/mds_client.c20
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/metric.c62
-rw-r--r--fs/ceph/metric.h56
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.h32
-rw-r--r--fs/ceph/xattr.c7
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h48
-rw-r--r--fs/cifs/cifsfs.c25
-rw-r--r--fs/cifs/cifsglob.h17
-rw-r--r--fs/cifs/cifspdu.h3
-rw-r--r--fs/cifs/cifsproto.h13
-rw-r--r--fs/cifs/connect.c37
-rw-r--r--fs/cifs/dir.c10
-rw-r--r--fs/cifs/file.c106
-rw-r--r--fs/cifs/fs_context.c10
-rw-r--r--fs/cifs/inode.c27
-rw-r--r--fs/cifs/ioctl.c179
-rw-r--r--fs/cifs/link.c13
-rw-r--r--fs/cifs/misc.c94
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb2ops.c4
-rw-r--r--fs/cifs/smb2pdu.c18
-rw-r--r--fs/cifs/trace.h29
-rw-r--r--fs/cifs/xattr.c4
-rw-r--r--fs/configfs/configfs_internal.h4
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/configfs/file.c4
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/configfs/item.c4
-rw-r--r--fs/configfs/mount.c4
-rw-r--r--fs/configfs/symlink.c4
-rw-r--r--fs/d_path.c10
-rw-r--r--fs/dax.c43
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/inode.c9
-rw-r--r--fs/dlm/config.c86
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/debug_fs.c1
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lockspace.c20
-rw-r--r--fs/dlm/lowcomms.c194
-rw-r--r--fs/dlm/lowcomms.h5
-rw-r--r--fs/dlm/midcomms.c33
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/ecryptfs/crypto.c29
-rw-r--r--fs/ecryptfs/debug.c4
-rw-r--r--fs/ecryptfs/dentry.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h19
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c174
-rw-r--r--fs/ecryptfs/keystore.c15
-rw-r--r--fs/ecryptfs/kthread.c3
-rw-r--r--fs/ecryptfs/main.c30
-rw-r--r--fs/ecryptfs/messaging.c14
-rw-r--r--fs/ecryptfs/miscdev.c3
-rw-r--r--fs/ecryptfs/mmap.c11
-rw-r--r--fs/ecryptfs/read_write.c4
-rw-r--r--fs/ecryptfs/super.c8
-rw-r--r--fs/erofs/zmap.c21
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/ext2/namei.c3
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/dir.c41
-rw-r--r--fs/ext4/ext4.h107
-rw-r--r--fs/ext4/fast_commit.c8
-rw-r--r--fs/ext4/file.c25
-rw-r--r--fs/ext4/hash.c25
-rw-r--r--fs/ext4/ialloc.c51
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c27
-rw-r--r--fs/ext4/inode.c8
-rw-r--r--fs/ext4/ioctl.c6
-rw-r--r--fs/ext4/mballoc.c592
-rw-r--r--fs/ext4/mballoc.h24
-rw-r--r--fs/ext4/migrate.c6
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/namei.c248
-rw-r--r--fs/ext4/super.c116
-rw-r--r--fs/ext4/sysfs.c8
-rw-r--r--fs/ext4/verity.c10
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/f2fs/Kconfig16
-rw-r--r--fs/f2fs/acl.c1
-rw-r--r--fs/f2fs/checkpoint.c9
-rw-r--r--fs/f2fs/compress.c70
-rw-r--r--fs/f2fs/compress.h0
-rw-r--r--fs/f2fs/data.c146
-rw-r--r--fs/f2fs/debug.c3
-rw-r--r--fs/f2fs/dir.c5
-rw-r--r--fs/f2fs/f2fs.h57
-rw-r--r--fs/f2fs/file.c54
-rw-r--r--fs/f2fs/gc.c95
-rw-r--r--fs/f2fs/gc.h6
-rw-r--r--fs/f2fs/inline.c3
-rw-r--r--fs/f2fs/inode.c3
-rw-r--r--fs/f2fs/namei.c6
-rw-r--r--fs/f2fs/node.c19
-rw-r--r--fs/f2fs/node.h1
-rw-r--r--fs/f2fs/recovery.c3
-rw-r--r--fs/f2fs/segment.c184
-rw-r--r--fs/f2fs/segment.h16
-rw-r--r--fs/f2fs/super.c102
-rw-r--r--fs/f2fs/sysfs.c47
-rw-r--r--fs/f2fs/verity.c75
-rw-r--r--fs/f2fs/xattr.c1
-rw-r--r--fs/fat/fatent.c2
-rw-r--r--fs/file.c39
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fuse/acl.c7
-rw-r--r--fs/fuse/cuse.c12
-rw-r--r--fs/fuse/dev.c7
-rw-r--r--fs/fuse/file.c71
-rw-r--r--fs/fuse/fuse_i.h13
-rw-r--r--fs/fuse/inode.c10
-rw-r--r--fs/fuse/virtio_fs.c28
-rw-r--r--fs/fuse/xattr.c9
-rw-r--r--fs/gfs2/file.c5
-rw-r--r--fs/gfs2/glock.c31
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/log.c6
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c7
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/hfsplus/extents.c7
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/hpfs.h3
-rw-r--r--fs/hugetlbfs/inode.c16
-rw-r--r--fs/inode.c9
-rw-r--r--fs/iomap/buffered-io.c11
-rw-r--r--fs/iomap/direct-io.c24
-rw-r--r--fs/isofs/rock.c1
-rw-r--r--fs/jbd2/recovery.c5
-rw-r--r--fs/jbd2/transaction.c15
-rw-r--r--fs/jffs2/TODO37
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/scan.c2
-rw-r--r--fs/jffs2/summary.h16
-rw-r--r--fs/locks.c3
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/netfs/Kconfig2
-rw-r--r--fs/netfs/read_helper.c2
-rw-r--r--fs/nfs/callback_proc.c17
-rw-r--r--fs/nfs/client.c20
-rw-r--r--fs/nfs/delegation.c29
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c36
-rw-r--r--fs/nfs/export.c15
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/fs_context.c66
-rw-r--r--fs/nfs/inode.c418
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/io.c2
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs42proc.c77
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs4file.c6
-rw-r--r--fs/nfs/nfs4proc.c268
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c8
-rw-r--r--fs/nfs/nfs4trace.h47
-rw-r--r--fs/nfs/nfs4xdr.c66
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h22
-rw-r--r--fs/nfs/pagelist.c24
-rw-r--r--fs/nfs/pnfs.c26
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c8
-rw-r--r--fs/nfs/write.c7
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/nfs4proc.c6
-rw-r--r--fs/nfsd/nfs4state.c415
-rw-r--r--fs/nfsd/nfs4xdr.c6
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/nfssvc.c8
-rw-r--r--fs/nfsd/state.h3
-rw-r--r--fs/nfsd/xdr4.h6
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/namei.c3
-rw-r--r--fs/nilfs2/segment.c4
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c30
-rw-r--r--fs/notify/fdinfo.c2
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/acl.h4
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/alloc.h4
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/aops.h4
-rw-r--r--fs/ocfs2/blockcheck.c6
-rw-r--r--fs/ocfs2/blockcheck.h4
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/buffer_head_io.h4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.c4
-rw-r--r--fs/ocfs2/cluster/masklog.h4
-rw-r--r--fs/ocfs2/cluster/netdebug.c4
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/nodemanager.h4
-rw-r--r--fs/ocfs2/cluster/ocfs2_heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h4
-rw-r--r--fs/ocfs2/cluster/quorum.c4
-rw-r--r--fs/ocfs2/cluster/quorum.h4
-rw-r--r--fs/ocfs2/cluster/sys.c4
-rw-r--r--fs/ocfs2/cluster/sys.h4
-rw-r--r--fs/ocfs2/cluster/tcp.c4
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h4
-rw-r--r--fs/ocfs2/dcache.c4
-rw-r--r--fs/ocfs2/dcache.h4
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dir.h4
-rw-r--r--fs/ocfs2/dlm/dlmapi.h4
-rw-r--r--fs/ocfs2/dlm/dlmast.c4
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.h4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h4
-rw-r--r--fs/ocfs2/dlm/dlmlock.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c11
-rw-r--r--fs/ocfs2/dlm/dlmthread.c4
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c4
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h4
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/dlmglue.h4
-rw-r--r--fs/ocfs2/export.c4
-rw-r--r--fs/ocfs2/export.h4
-rw-r--r--fs/ocfs2/extent_map.c4
-rw-r--r--fs/ocfs2/extent_map.h4
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/file.h4
-rw-r--r--fs/ocfs2/filecheck.c4
-rw-r--r--fs/ocfs2/filecheck.h4
-rw-r--r--fs/ocfs2/heartbeat.c4
-rw-r--r--fs/ocfs2/heartbeat.h4
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/inode.h4
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/localalloc.h4
-rw-r--r--fs/ocfs2/locks.c4
-rw-r--r--fs/ocfs2/locks.h4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/move_extents.h4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/namei.h4
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h4
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h4
-rw-r--r--fs/ocfs2/ocfs2_lockid.h4
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h4
-rw-r--r--fs/ocfs2/refcounttree.c4
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/reservations.c4
-rw-r--r--fs/ocfs2/reservations.h4
-rw-r--r--fs/ocfs2/resize.c4
-rw-r--r--fs/ocfs2/resize.h4
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/slot_map.h4
-rw-r--r--fs/ocfs2/stack_o2cb.c40
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h4
-rw-r--r--fs/ocfs2/suballoc.c4
-rw-r--r--fs/ocfs2/suballoc.h4
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/ocfs2/super.h4
-rw-r--r--fs/ocfs2/symlink.c4
-rw-r--r--fs/ocfs2/symlink.h4
-rw-r--r--fs/ocfs2/sysfile.c4
-rw-r--r--fs/ocfs2/sysfile.h4
-rw-r--r--fs/ocfs2/uptodate.c4
-rw-r--r--fs/ocfs2/uptodate.h4
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/orangefs/file.c34
-rw-r--r--fs/orangefs/inode.c122
-rw-r--r--fs/orangefs/orangefs-mod.c2
-rw-r--r--fs/overlayfs/copy_up.c3
-rw-r--r--fs/overlayfs/file.c21
-rw-r--r--fs/overlayfs/inode.c18
-rw-r--r--fs/overlayfs/namei.c1
-rw-r--r--fs/overlayfs/overlayfs.h37
-rw-r--r--fs/overlayfs/readdir.c12
-rw-r--r--fs/overlayfs/super.c66
-rw-r--r--fs/overlayfs/util.c33
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/generic.c13
-rw-r--r--fs/proc/inode.c18
-rw-r--r--fs/proc/proc_sysctl.c15
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/reiserfs/procfs.c10
-rw-r--r--fs/seq_file.c18
-rw-r--r--fs/signalfd.c23
-rw-r--r--fs/squashfs/file.c6
-rw-r--r--fs/stat.c8
-rw-r--r--fs/super.c1
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/sb.c3
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/udf/namei.c3
-rw-r--r--fs/ufs/super.c3
-rw-r--r--fs/unicode/.gitignore4
-rw-r--r--fs/userfaultfd.c149
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c46
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c17
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c12
-rw-r--r--fs/xfs/libxfs/xfs_fs.h4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c46
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c16
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c17
-rw-r--r--fs/xfs/scrub/agheader.c7
-rw-r--r--fs/xfs/scrub/common.c4
-rw-r--r--fs/xfs/scrub/fscounters.c40
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c98
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_inode.c29
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_inode_item_recover.c6
-rw-r--r--fs/xfs/xfs_ioctl.c101
-rw-r--r--fs/xfs/xfs_log.c10
-rw-r--r--fs/xfs/xfs_message.h2
-rw-r--r--fs/xfs/xfs_mount.c15
-rw-r--r--fs/xfs/xfs_mount.h6
-rw-r--r--fs/xfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/xfs_reflink.c3
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans.h15
403 files changed, 5829 insertions, 3482 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 39def020a074..cdb99507ef33 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = {
NULL,
};
-static struct attribute_group v9fs_attr_group = {
+static const struct attribute_group v9fs_attr_group = {
.attrs = v9fs_attrs,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 649f04f112dc..59c32c9b799f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
* to work.
*/
writeback_fid = v9fs_writeback_fid(file_dentry(file));
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
+ if (IS_ERR(writeback_fid)) {
+ err = PTR_ERR(writeback_fid);
mutex_unlock(&v9inode->v_mutex);
goto out_error;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 97e7b77c9309..141a856c50e7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -223,10 +223,13 @@ config TMPFS_INODE64
If unsure, say N.
+config ARCH_SUPPORTS_HUGETLBFS
+ def_bool n
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
- SYS_SUPPORTS_HUGETLBFS || BROKEN
+ ARCH_SUPPORTS_HUGETLBFS || BROKEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -335,8 +338,8 @@ config NFS_COMMON
default y
config NFS_V4_2_SSC_HELPER
- tristate
- default y if NFS_V4=y || NFS_FS=y
+ bool
+ default y if NFS_V4_2
source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c6f1c8c1934e..06fb7a93a1bd 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
config BINFMT_FLAT_OLD_ALWAYS_RAM
bool
+config BINFMT_FLAT_NO_DATA_START_OFFSET
+ bool
+
config BINFMT_FLAT_OLD
bool "Enable support for very old legacy flat binaries"
depends on BINFMT_FLAT
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a4e9e6e07e93..d3c6bb22c5f4 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call)
return ret;
call->unmarshall++;
+ fallthrough;
+
case 5:
break;
}
@@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
+ fallthrough;
case 3:
break;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 117df15e5367..78719f2f567e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1419,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFDIR | mode;
@@ -1500,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
@@ -1636,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
/* Try to make sure we have a callback promise on the victim. */
@@ -1718,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
@@ -1792,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
@@ -1914,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op)
new_inode = d_inode(new_dentry);
if (new_inode) {
spin_lock(&new_inode->i_lock);
- if (new_inode->i_nlink > 0)
+ if (S_ISDIR(new_inode->i_mode))
+ clear_nlink(new_inode);
+ else if (new_inode->i_nlink > 0)
drop_nlink(new_inode);
spin_unlock(&new_inode->i_lock);
}
@@ -1987,6 +1994,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].modification = true;
+ op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 04f75a44f243..dae9a57d7ec0 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].modification = true;
+ op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
@@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->file[1].op_unlinked = true;
op->file[1].update_ctime = true;
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 2cb0951acca6..d222dfbe976b 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *
vp->cb_break_before = afs_calc_vnode_cb_break(vnode);
if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
op->flags |= AFS_OPERATION_CUR_ONLY;
+ if (vp->modification)
+ set_bit(AFS_VNODE_MODIFYING, &vnode->flags);
}
if (vp->fid.vnode)
@@ -225,6 +227,10 @@ int afs_put_operation(struct afs_operation *op)
if (op->ops && op->ops->put)
op->ops->put(op);
+ if (op->file[0].modification)
+ clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags);
+ if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
+ clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
iput(&op->file[0].vnode->vfs_inode);
if (op->file[1].put_vnode)
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 2f695a260442..dd3f45d906d2 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -388,6 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
+ fallthrough;
case 5:
break;
@@ -1408,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
+ fallthrough;
case 8:
break;
@@ -1845,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 6:
break;
@@ -1979,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 4:
break;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 3a129b9fd9b8..80b6c8d967d5 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -294,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
} else if (vp->scb.have_status) {
- if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version &&
- vp->speculative)
+ if (vp->speculative &&
+ (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) ||
+ vp->dv_before != vnode->status.data_version))
/* Ignore the result of a speculative bulk status fetch
* if it splits around a modification op, thereby
* appearing to regress the data version.
@@ -911,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
op->ctime = attr->ia_ctime;
op->file[0].update_ctime = 1;
+ op->file[0].modification = true;
op->ops = &afs_setattr_operation;
ret = afs_do_sync_operation(op);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 52157a05796a..5ed416f4ff33 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -645,6 +645,7 @@ struct afs_vnode {
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
+#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
@@ -762,6 +763,7 @@ struct afs_vnode_param {
bool set_size:1; /* Must update i_size */
bool op_unlinked:1; /* True if file was unlinked by op */
bool speculative:1; /* T if speculative status fetch (no vnode lock) */
+ bool modification:1; /* Set if the content gets modified */
};
/*
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index dc9327332f06..00fca3c66ba6 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ret < 0)
return ret;
call->unmarshall = 6;
+ fallthrough;
case 6:
break;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index dc66ff15dd16..3edb6204b937 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -377,6 +377,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..76ce0cc3ee4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
- if (flags & MREMAP_DONTUNMAP)
- return -EINVAL;
-
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 054f97b07754..918826eaceea 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -87,6 +87,7 @@ struct autofs_wait_queue {
autofs_wqt_t wait_queue_token;
/* We use the following to see what we are waiting for */
struct qstr name;
+ u32 offset;
u32 dev;
u64 ino;
kuid_t uid;
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index a1c7701007e7..b3fefd6237c3 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
+ if (d_is_symlink(dentry)) {
pr_debug("checking symlink %p %pd\n", dentry, dentry);
/* Forced expire, user space handles busy mounts */
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 5ced859dac53..16b5fca0626e 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- kfree(wq->name.name);
+ kfree(wq->name.name - wq->offset);
wq->name.name = NULL;
wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
@@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi,
fput(pipe);
}
-static int autofs_getpath(struct autofs_sb_info *sbi,
- struct dentry *dentry, char *name)
-{
- struct dentry *root = sbi->sb->s_root;
- struct dentry *tmp;
- char *buf;
- char *p;
- int len;
- unsigned seq;
-
-rename_retry:
- buf = name;
- len = 0;
-
- seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- spin_lock(&sbi->fs_lock);
- for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
- len += tmp->d_name.len + 1;
-
- if (!len || --len > NAME_MAX) {
- spin_unlock(&sbi->fs_lock);
- rcu_read_unlock();
- if (read_seqretry(&rename_lock, seq))
- goto rename_retry;
- return 0;
- }
-
- *(buf + len) = '\0';
- p = buf + len - dentry->d_name.len;
- strncpy(p, dentry->d_name.name, dentry->d_name.len);
-
- for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) {
- *(--p) = '/';
- p -= tmp->d_name.len;
- strncpy(p, tmp->d_name.name, tmp->d_name.len);
- }
- spin_unlock(&sbi->fs_lock);
- rcu_read_unlock();
- if (read_seqretry(&rename_lock, seq))
- goto rename_retry;
-
- return len;
-}
-
static struct autofs_wait_queue *
autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
{
@@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
struct qstr qstr;
char *name;
int status, ret, type;
+ unsigned int offset = 0;
pid_t pid;
pid_t tgid;
@@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi,
return -ENOMEM;
/* If this is a direct mount request create a dummy name */
- if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
+ if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) {
+ qstr.name = name;
qstr.len = sprintf(name, "%p", dentry);
- else {
- qstr.len = autofs_getpath(sbi, dentry, name);
- if (!qstr.len) {
+ } else {
+ char *p = dentry_path_raw(dentry, name, NAME_MAX);
+ if (IS_ERR(p)) {
kfree(name);
return -ENOENT;
}
+ qstr.name = ++p; // skip the leading slash
+ qstr.len = strlen(p);
+ offset = p - name;
}
- qstr.name = name;
qstr.hash = full_name_hash(dentry, name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
- kfree(qstr.name);
+ kfree(name);
return -EINTR;
}
@@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
if (ret <= 0) {
if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
- kfree(qstr.name);
+ kfree(name);
return ret;
}
@@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
/* Create a new wait queue */
wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
- kfree(qstr.name);
+ kfree(name);
mutex_unlock(&sbi->wq_mutex);
return -ENOMEM;
}
@@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
sbi->queues = wq;
init_waitqueue_head(&wq->queue);
memcpy(&wq->name, &qstr, sizeof(struct qstr));
+ wq->offset = offset;
wq->dev = autofs_get_dev(sbi);
wq->ino = autofs_get_ino(sbi);
wq->uid = current_uid();
@@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
(unsigned long) wq->wait_queue_token, wq->name.len,
wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
- kfree(qstr.name);
+ kfree(name);
}
/*
@@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi,
}
*wql = wq->next; /* Unlink from chain */
- kfree(wq->name.name);
+ kfree(wq->name.name - wq->offset);
wq->name.name = NULL; /* Do not wait on this queue */
wq->status = status;
wake_up(&wq->queue);
diff --git a/fs/befs/TODO b/fs/befs/TODO
deleted file mode 100644
index 3250921aa2e6..000000000000
--- a/fs/befs/TODO
+++ /dev/null
@@ -1,14 +0,0 @@
-TODO
-==========
-
-* Convert comments to the Kernel-Doc format.
-
-* Befs_fs.h has gotten big and messy. No reason not to break it up into
- smaller peices.
-
-* See if Alexander Viro's option parser made it into the kernel tree.
- Use that if we can. (include/linux/parser.h)
-
-* See if we really need separate types for on-disk and in-memory
- representations of the superblock and inode.
-
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..a1072c6a2341 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
#define MAX_SHARED_LIBS (1)
#endif
+#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
+#define DATA_START_OFFSET_WORDS (0)
+#else
+#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS)
+#endif
+
struct lib_info {
struct {
unsigned long start_code; /* Start of text segment */
@@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+ len = data_len + extra +
+ DATA_START_OFFSET_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
- MAX_SHARED_LIBS * sizeof(unsigned long),
+ DATA_START_OFFSET_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
+ len = text_len + data_len + extra +
+ DATA_START_OFFSET_WORDS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm,
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
- MAX_SHARED_LIBS * sizeof(u32),
+ DATA_START_OFFSET_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
@@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
- MAX_SHARED_LIBS * sizeof(u32));
+ DATA_START_OFFSET_WORDS * sizeof(u32));
goto err;
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a5244e08b6c8..6cc4d4cfe0c2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
return;
invalidate_bh_lrus();
@@ -1244,6 +1244,9 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
+
rescan:
if (bdev->bd_part_count)
return -EBUSY;
@@ -1298,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode)
struct gendisk *disk = bdev->bd_disk;
int ret = 0;
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
+
if (!bdev->bd_openers) {
if (!bdev_is_partition(bdev)) {
ret = 0;
@@ -1332,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode)
whole->bd_part_count++;
mutex_unlock(&whole->bd_mutex);
- if (!(disk->flags & GENHD_FL_UP) ||
- !bdev_nr_sectors(bdev)) {
+ if (!bdev_nr_sectors(bdev)) {
__blkdev_put(whole, mode, 1);
bdput(whole);
return -ENXIO;
@@ -1364,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct block_device *bdev;
struct gendisk *disk;
- down_read(&bdev_lookup_sem);
bdev = bdget(dev);
if (!bdev) {
- up_read(&bdev_lookup_sem);
blk_request_module(dev);
- down_read(&bdev_lookup_sem);
-
bdev = bdget(dev);
if (!bdev)
- goto unlock;
+ return NULL;
}
disk = bdev->bd_disk;
@@ -1383,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
goto put_disk;
if (!try_module_get(bdev->bd_disk->fops->owner))
goto put_disk;
- up_read(&bdev_lookup_sem);
return bdev;
put_disk:
put_disk(disk);
bdput:
bdput(bdev);
-unlock:
- up_read(&bdev_lookup_sem);
return NULL;
}
@@ -1677,6 +1675,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
+ size_t shorted = 0;
ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode)))
@@ -1694,12 +1693,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
- iov_iter_truncate(from, size - iocb->ki_pos);
+ size -= iocb->ki_pos;
+ if (iov_iter_count(from) > size) {
+ shorted = iov_iter_count(from) - size;
+ iov_iter_truncate(from, size);
+ }
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
+ iov_iter_reexpand(from, iov_iter_count(from) + shorted);
blk_finish_plug(&plug);
return ret;
}
@@ -1711,13 +1715,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
+ size_t shorted = 0;
+ ssize_t ret;
if (pos >= size)
return 0;
size -= pos;
- iov_iter_truncate(to, size);
- return generic_file_read_iter(iocb, to);
+ if (iov_iter_count(to) > size) {
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
+ }
+
+ ret = generic_file_read_iter(iocb, to);
+ iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 17f93fd28f7e..1346d698463a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -28,6 +28,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -349,6 +350,7 @@ static void end_compressed_bio_write(struct bio *bio)
*/
inode = cb->inode;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+ btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1,
bio->bi_status == BLK_STS_OK);
@@ -401,6 +403,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
u64 first_byte = disk_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
+ const bool use_append = btrfs_use_zone_append(inode, disk_start);
+ const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -418,10 +422,31 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->nr_pages = nr_pages;
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (use_append) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct block_device *bdev;
+
+ em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE);
+ if (IS_ERR(em)) {
+ kfree(cb);
+ bio_put(bio);
+ return BLK_STS_NOTSUPP;
+ }
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ bdev = map->stripes[0].dev->bdev;
+
+ bio_set_dev(bio, bdev);
+ free_extent_map(em);
+ }
+
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
kthread_associate_blkcg(blkcg_css);
@@ -432,6 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
+ int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
@@ -439,9 +465,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
+ /*
+ * Page can only be added to bio if the current bio fits in
+ * stripe.
+ */
+ if (!submit) {
+ if (pg_index == 0 && use_append)
+ len = bio_add_zone_append_page(bio, page,
+ PAGE_SIZE, 0);
+ else
+ len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ }
+
page->mapping = NULL;
- if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
+ if (submit || len < PAGE_SIZE) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -465,11 +502,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
}
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
+ /*
+ * Use bio_add_page() to ensure the bio has at least one
+ * page.
+ */
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -591,16 +632,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
free_extent_map(em);
if (page->index == end_index) {
- char *userpage;
size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
zeros = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, zeros);
+ memzero_page(page, zero_offset, zeros);
flush_dcache_page(page);
- kunmap_atomic(userpage);
}
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f83fd3cbf243..9fb76829a281 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a28314189b4..3d5c35e4cb76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
+ struct btrfs_device *device = stripe->dev;
- if (!stripe->dev->bdev) {
+ if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ continue;
+
ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
@@ -1864,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f2d1bb234377..dee2dafbc872 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
if (page->index == last_byte >> PAGE_SHIFT) {
- char *userpage;
size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, iosize);
+ memzero_page(page, zero_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
}
}
begin_page_read(fs_info, page);
@@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 disk_bytenr;
if (cur >= last_byte) {
- char *userpage;
struct extent_state *cached = NULL;
iosize = PAGE_SIZE - pg_offset;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
+ memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
@@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- char *userpage;
struct extent_state *cached = NULL;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
+ memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -3762,7 +3753,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
/* Note that em_end from extent_map_end() is exclusive */
iosize = min(em_end, end + 1) - cur;
- if (btrfs_use_zone_append(inode, em))
+ if (btrfs_use_zone_append(inode, em->block_start))
opf = REQ_OP_ZONE_APPEND;
free_extent_map(em);
@@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_SIZE - pg_offset);
- kunmap_atomic(userpage);
+ memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
flush_dcache_page(page);
}
@@ -5210,7 +5196,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
- u64 off = start;
+ u64 off;
u64 max = start + len;
u32 flags = 0;
u32 found_type;
@@ -5245,6 +5231,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out_free_ulist;
}
+ /*
+ * We can't initialize 'off' to 'start' as this could miss extents due
+ * to extent item merging.
+ */
+ off = 0;
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 294602f139ef..441cee7fbb62 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
@@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
@@ -981,26 +1011,10 @@ again:
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- found_next = 1;
- goto insert;
- }
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
@@ -1056,8 +1070,48 @@ extend_csum:
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 864c08d08a35..3b10d98b4ebb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
return ret;
}
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- /*
- * If we are doing a fast fsync we can not bail out if the inode's
- * last_trans is <= then the last committed transaction, because we only
- * update the last_trans of the inode during ordered extent completion,
- * and for a fast fsync we don't wait for that, we only wait for the
- * writeback to complete.
- */
smp_mb();
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
- (full_sync || list_empty(&ctx.ordered_extents)))) {
+ if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e54466fc101f..4806295116d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *block_group;
struct rb_node *node;
- int ret;
+ int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b21d491b3adc..46f392943f4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -646,17 +646,12 @@ again:
if (!ret) {
unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
- char *kaddr;
/* zero the tail end of the last page, we might be
* sending it down to disk
*/
- if (offset) {
- kaddr = kmap_atomic(page);
- memset(kaddr + offset, 0,
- PAGE_SIZE - offset);
- kunmap_atomic(kaddr);
- }
+ if (offset)
+ memzero_page(page, offset, PAGE_SIZE - offset);
will_compress = 1;
}
}
@@ -3005,6 +3000,18 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
@@ -3246,6 +3253,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
+ cond_resched_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
@@ -4833,7 +4841,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
@@ -4925,15 +4932,13 @@ again:
if (offset != blocksize) {
if (!len)
len = blocksize - offset;
- kaddr = kmap(page);
if (front)
- memset(kaddr + (block_start - page_offset(page)),
- 0, offset);
+ memzero_page(page, (block_start - page_offset(page)),
+ offset);
else
- memset(kaddr + (block_start - page_offset(page)) + offset,
- 0, len);
+ memzero_page(page, (block_start - page_offset(page)) + offset,
+ len);
flush_dcache_page(page);
- kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
@@ -6832,11 +6837,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size + pg_offset < PAGE_SIZE) {
- char *map = kmap(page);
- memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
- kunmap(page);
- }
+ if (max_size + pg_offset < PAGE_SIZE)
+ memzero_page(page, pg_offset + max_size,
+ PAGE_SIZE - max_size - pg_offset);
kfree(tmp);
return ret;
}
@@ -7795,7 +7798,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
- if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
iomap->flags |= IOMAP_F_ZONE_APPEND;
free_extent_map(em);
@@ -8506,7 +8509,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
unsigned long zero_start;
loff_t size;
vm_fault_t ret;
@@ -8620,10 +8622,8 @@ again:
zero_start = PAGE_SIZE;
if (zero_start != PAGE_SIZE) {
- kaddr = kmap(page);
- memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
- kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
@@ -9088,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9147,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9162,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -9691,7 +9696,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
@@ -9704,7 +9709,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- return start_delalloc_inodes(root, &wbc, true, false);
+ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ee1dbabb5d3c..5dc2fd843ae3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
if (!fa->flags_valid) {
/* 1 item for the inode */
trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
goto update_flags;
}
@@ -907,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
*/
btrfs_drew_read_lock(&root->snapshot_lock);
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
goto out;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 07b0b4218791..6c413bb451a3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (pre)
ret = clone_ordered_extent(ordered, 0, pre);
- if (post)
+ if (ret == 0 && post)
ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
post);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2319c923c9e6..3ded812f522c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- /* Can't hold an open transaction or we run the risk of deadlocking */
- ASSERT(current->journal_info == NULL ||
- current->journal_info == BTRFS_SEND_TRANS_STUB);
- if (WARN_ON(current->journal_info &&
- current->journal_info != BTRFS_SEND_TRANS_STUB))
+ /*
+ * Can't hold an open transaction or we run the risk of deadlocking,
+ * and we also can't be in the context of a send operation (where
+ * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
+ * would result in a crash when starting a transaction and does not
+ * make sense either (send is a read-only operation).
+ */
+ ASSERT(current->journal_info == NULL);
+ if (WARN_ON(current->journal_info))
return 0;
/*
@@ -3562,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f4ec06b53aa0..9178da07cc9c 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size) {
- char *map;
-
- map = kmap(page);
- memset(map + datal, 0, block_size - datal);
+ memzero_page(page, datal, block_size - datal);
flush_dcache_page(page);
- kunmap(page);
}
SetPageUptodate(page);
@@ -207,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
@@ -226,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
@@ -244,11 +234,13 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -305,6 +297,21 @@ out:
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
/**
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 55741adf9071..bd69db72acc5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
int i;
if (root) {
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
for (i = 0; i < sctx->clone_roots_cnt; i++) {
root = sctx->clone_roots[i].root;
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f67721d82e5d..362d14db1e38 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -6061,7 +6064,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
*/
- if (btrfs_inode_in_log(inode, trans->transid) ||
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
@@ -6462,6 +6466,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
(!old_dir || old_dir->logged_trans < trans->transid))
return;
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure the next log attempt on the directory
+ * is not skipped and logs the inode again. This is because the log may
+ * not currently be authoritative for a range including the old
+ * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
+ * sure after a log replay we do not end up with both the new and old
+ * dentries around (in case the inode is a directory we would have a
+ * directory with two hard links and 2 inode references for different
+ * parents). The next log attempt of old_dir will happen at
+ * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
+ * below, because we have previously set inode->last_unlink_trans to the
+ * current transaction ID, either here or at btrfs_record_unlink_dir() in
+ * case inode is a directory.
+ */
+ if (old_dir)
+ old_dir->logged_trans = 0;
+
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a1ead0c4a31..47d27059d064 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1459,7 +1459,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
/* Given hole range was invalid (outside of device) */
if (ret == -ERANGE) {
*hole_start += *hole_size;
- *hole_size = false;
+ *hole_size = 0;
return true;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index d524acf7b3e5..c3fa7d3fa770 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes_left;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes_left = destlen;
@@ -455,9 +454,7 @@ next:
* end of the inline extent (destlen) to the end of the page
*/
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 70b23a0d03b1..1bb8ee97aae0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ ret = -EIO;
+ goto out;
+ }
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
@@ -1273,7 +1278,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans)
spin_unlock(&trans->releasing_ebs_lock);
}
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
@@ -1288,7 +1293,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
if (!is_data_inode(&inode->vfs_inode))
return false;
- cache = btrfs_lookup_block_group(fs_info, em->block_start);
+ cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
return false;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 5e41a74a9cb2..e55d32595c2c 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -53,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
void btrfs_free_redirty_list(struct btrfs_transaction *trans);
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
struct bio *bio);
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
@@ -152,8 +152,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb) { }
static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode,
- struct extent_map *em)
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
return false;
}
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 8e9626d63976..3e26b466476a 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
size_t ret2;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
stream = ZSTD_initDStream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ret = 0;
finish:
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 0cb7ffd4977c..ea48c01fb76b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
pgoff_t index;
int sizebits;
- sizebits = -1;
- do {
- sizebits++;
- } while ((size << sizebits) < PAGE_SIZE);
-
+ sizebits = PAGE_SHIFT - __ffs(size);
index = block >> sizebits;
/*
@@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh)
int i;
check_irqs_on();
+ /*
+ * the refcount of buffer_head in bh_lru prevents dropping the
+ * attached page(i.e., try_to_free_buffers) so it could cause
+ * failing page migration.
+ * Skip putting upcoming bh into bh_lru until migration is done.
+ */
+ if (lru_cache_disabled())
+ return;
+
bh_lru_lock();
b = this_cpu_ptr(&bh_lrus);
@@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
}
EXPORT_SYMBOL(__bread_gfp);
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+ int i;
+
+ for (i = 0; i < BH_LRU_SIZE; i++) {
+ brelse(b->bhs[i]);
+ b->bhs[i] = NULL;
+ }
+}
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
* This doesn't race because it runs in each cpu either in irq
@@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp);
static void invalidate_bh_lru(void *arg)
{
struct bh_lru *b = &get_cpu_var(bh_lrus);
- int i;
- for (i = 0; i < BH_LRU_SIZE; i++) {
- brelse(b->bhs[i]);
- b->bhs[i] = NULL;
- }
+ __invalidate_bh_lrus(b);
put_cpu_var(bh_lrus);
}
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
{
struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
int i;
@@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void)
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
+void invalidate_bh_lrus_cpu(int cpu)
+{
+ struct bh_lru *b;
+
+ bh_lru_lock();
+ b = per_cpu_ptr(&bh_lrus, cpu);
+ __invalidate_bh_lrus(b);
+ bh_lru_unlock();
+}
+
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 471e40156065..94df854147d3 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -6,6 +6,7 @@ config CEPH_FS
select LIBCRC32C
select CRYPTO_AES
select CRYPTO
+ select NETFS_SUPPORT
default n
help
Choose Y or M here to include support for mounting the
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26e66436f005..c1570fada3d8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -12,6 +12,7 @@
#include <linux/signal.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
+#include <linux/netfs.h>
#include "super.h"
#include "mds_client.h"
@@ -61,6 +62,9 @@
(CONGESTION_ON_THRESH(congestion_kb) - \
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct page *page, void **_fsdata);
+
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
if (PagePrivate(page))
@@ -124,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page)
* PagePrivate so that we get invalidatepage callback.
*/
BUG_ON(PagePrivate(page));
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
+ attach_page_private(page, snapc);
ret = __set_page_dirty_nobuffers(page);
WARN_ON(!PageLocked(page));
@@ -144,19 +147,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_snap_context *snapc;
+
+ wait_on_page_fscache(page);
inode = page->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != PAGE_SIZE) {
+ if (offset != 0 || length != thp_size(page)) {
dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
inode, page, page->index, offset, length);
return;
}
- ceph_invalidate_fscache_page(inode, page);
-
WARN_ON(!PageLocked(page));
if (!PagePrivate(page))
return;
@@ -164,333 +167,222 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
dout("%p invalidatepage %p idx %lu full dirty page\n",
inode, page, page->index);
+ snapc = detach_page_private(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
}
-static int ceph_releasepage(struct page *page, gfp_t g)
+static int ceph_releasepage(struct page *page, gfp_t gfp)
{
dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
page, page->index, PageDirty(page) ? "" : "not ");
- /* Can we release the page from the cache? */
- if (!ceph_release_fscache_page(page, g))
- return 0;
-
+ if (PageFsCache(page)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+ return 0;
+ wait_on_page_fscache(page);
+ }
return !PagePrivate(page);
}
-/* read a single page, without unlocking it. */
-static int ceph_do_readpage(struct file *filp, struct page *page)
+static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = rreq->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- struct ceph_vino vino = ceph_vino(inode);
- int err = 0;
- u64 off = page_offset(page);
- u64 len = PAGE_SIZE;
-
- if (off >= i_size_read(inode)) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- return 0;
- }
-
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0)
- return -EINVAL;
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- return 0;
- }
-
- err = ceph_readpage_from_fscache(inode, page);
- if (err == 0)
- return -EINPROGRESS;
-
- dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
- vino.ino, vino.snap, filp, off, len, page, page->index);
- req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ struct ceph_file_layout *lo = &ci->i_layout;
+ u32 blockoff;
+ u64 blockno;
- osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+ /* Expand the start downward */
+ blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+ rreq->start = blockno * lo->stripe_unit;
+ rreq->len += blockoff;
- err = ceph_osdc_start_request(osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, err);
-
- ceph_osdc_put_request(req);
- dout("readpage result %d\n", err);
-
- if (err == -ENOENT)
- err = 0;
- if (err < 0) {
- ceph_fscache_readpage_cancel(inode, page);
- if (err == -EBLOCKLISTED)
- fsc->blocklisted = true;
- goto out;
- }
- if (err < PAGE_SIZE)
- /* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_SIZE);
- else
- flush_dcache_page(page);
-
- SetPageUptodate(page);
- ceph_readpage_to_fscache(inode, page);
-
-out:
- return err < 0 ? err : 0;
+ /* Now, round up the length to the next block */
+ rreq->len = roundup(rreq->len, lo->stripe_unit);
}
-static int ceph_readpage(struct file *filp, struct page *page)
+static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
- int r = ceph_do_readpage(filp, page);
- if (r != -EINPROGRESS)
- unlock_page(page);
- else
- r = 0;
- return r;
+ struct inode *inode = subreq->rreq->mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Truncate the extent at the end of the current block */
+ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+ &objno, &objoff, &xlen);
+ subreq->len = min(xlen, fsc->mount_options->rsize);
+ return true;
}
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req)
+static void finish_netfs_read(struct ceph_osd_request *req)
{
- struct inode *inode = req->r_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_data *osd_data;
- int rc = req->r_result <= 0 ? req->r_result : 0;
- int bytes = req->r_result >= 0 ? req->r_result : 0;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct netfs_read_subrequest *subreq = req->r_priv;
int num_pages;
- int i;
+ int err = req->r_result;
- dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- if (rc == -EBLOCKLISTED)
- ceph_inode_to_client(inode)->blocklisted = true;
+ ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
- /* unlock all pages, zeroing any data we didn't read */
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- for (i = 0; i < num_pages; i++) {
- struct page *page = osd_data->pages[i];
-
- if (rc < 0 && rc != -ENOENT) {
- ceph_fscache_readpage_cancel(inode, page);
- goto unlock;
- }
- if (bytes < (int)PAGE_SIZE) {
- /* zero (remainder of) page */
- int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_SIZE);
- }
- dout("finish_read %p uptodate %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- ceph_readpage_to_fscache(inode, page);
-unlock:
- unlock_page(page);
- put_page(page);
- bytes -= PAGE_SIZE;
- }
+ dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
+ subreq->len, i_size_read(req->r_inode));
- ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
+ /* no object means success but no data */
+ if (err == -ENOENT)
+ err = 0;
+ else if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+
+ if (err >= 0 && err < subreq->len)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+
+ netfs_subreq_terminated(subreq, err, true);
- kfree(osd_data->pages);
+ num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
+ ceph_put_page_vector(osd_data->pages, num_pages, false);
+ iput(req->r_inode);
}
-/*
- * start an async read(ahead) operation. return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
- struct list_head *page_list, int max)
+static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = lru_to_page(page_list);
- struct ceph_vino vino;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- u64 off;
- u64 len;
- int i;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct iov_iter iter;
struct page **pages;
- pgoff_t next_index;
- int nr_pages = 0;
- int got = 0;
- int ret = 0;
-
- if (!rw_ctx) {
- /* caller of readpages does not hold buffer and read caps
- * (fadvise, madvise and readahead cases) */
- int want = CEPH_CAP_FILE_CACHE;
- ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
- true, &got);
- if (ret < 0) {
- dout("start_read %p, error getting cap\n", inode);
- } else if (!(got & want)) {
- dout("start_read %p, no cache cap\n", inode);
- ret = 0;
- }
- if (ret <= 0) {
- if (got)
- ceph_put_cap_refs(ci, got);
- while (!list_empty(page_list)) {
- page = lru_to_page(page_list);
- list_del(&page->lru);
- put_page(page);
- }
- return ret;
- }
- }
-
- off = (u64) page_offset(page);
+ size_t page_off;
+ int err = 0;
+ u64 len = subreq->len;
- /* count pages */
- next_index = page->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index != next_index)
- break;
- nr_pages++;
- next_index++;
- if (max && nr_pages == max)
- break;
- }
- len = nr_pages << PAGE_SHIFT;
- dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
- off, len);
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
- 0, 1, CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ, NULL,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
+ 0, 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
+ NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
if (IS_ERR(req)) {
- ret = PTR_ERR(req);
+ err = PTR_ERR(req);
+ req = NULL;
goto out;
}
- /* build page vector */
- nr_pages = calc_pages_for(0, len);
- pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_put;
- }
- for (i = 0; i < nr_pages; ++i) {
- page = list_entry(page_list->prev, struct page, lru);
- BUG_ON(PageLocked(page));
- list_del(&page->lru);
-
- dout("start_read %p adding %p idx %lu\n", inode, page,
- page->index);
- if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_KERNEL)) {
- ceph_fscache_uncache_page(inode, page);
- put_page(page);
- dout("start_read %p add_to_page_cache failed %p\n",
- inode, page);
- nr_pages = i;
- if (nr_pages > 0) {
- len = nr_pages << PAGE_SHIFT;
- osd_req_op_extent_update(req, 0, len);
- break;
- }
- goto out_pages;
- }
- pages[i] = page;
+ dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+ if (err < 0) {
+ dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
+ goto out;
}
+
+ /* should always give us a page-aligned read */
+ WARN_ON_ONCE(page_off);
+ len = err;
+
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
- req->r_callback = finish_read;
+ req->r_callback = finish_netfs_read;
+ req->r_priv = subreq;
req->r_inode = inode;
+ ihold(inode);
- dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto out_pages;
+ err = ceph_osdc_start_request(req->r_osdc, req, false);
+ if (err)
+ iput(inode);
+out:
ceph_osdc_put_request(req);
+ if (err)
+ netfs_subreq_terminated(subreq, err, false);
+ dout("%s: result %d\n", __func__, err);
+}
- /* After adding locked pages to page cache, the inode holds cache cap.
- * So we can drop our cap refs. */
- if (got)
- ceph_put_cap_refs(ci, got);
+static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
+{
+}
- return nr_pages;
+static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int got = (uintptr_t)priv;
-out_pages:
- for (i = 0; i < nr_pages; ++i) {
- ceph_fscache_readpage_cancel(inode, pages[i]);
- unlock_page(pages[i]);
- }
- ceph_put_page_vector(pages, nr_pages, false);
-out_put:
- ceph_osdc_put_request(req);
-out:
if (got)
ceph_put_cap_refs(ci, got);
- return ret;
}
+const struct netfs_read_request_ops ceph_netfs_read_ops = {
+ .init_rreq = ceph_init_rreq,
+ .is_cache_enabled = ceph_is_cache_enabled,
+ .begin_cache_operation = ceph_begin_cache_operation,
+ .issue_op = ceph_netfs_issue_op,
+ .expand_readahead = ceph_netfs_expand_readahead,
+ .clamp_length = ceph_netfs_clamp_length,
+ .check_write_begin = ceph_netfs_check_write_begin,
+ .cleanup = ceph_readahead_cleanup,
+};
-/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+/* read a single page, without unlocking it. */
+static int ceph_readpage(struct file *file, struct page *page)
{
struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vino vino = ceph_vino(inode);
+ u64 off = page_offset(page);
+ u64 len = thp_size(page);
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ /*
+ * Uptodate inline data should have been added
+ * into page cache while getting Fcr caps.
+ */
+ if (off == 0) {
+ unlock_page(page);
+ return -EINVAL;
+ }
+ zero_user_segment(page, 0, thp_size(page));
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+
+ dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+ vino.ino, vino.snap, file, off, len, page, page->index);
+
+ return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
+}
+
+static void ceph_readahead(struct readahead_control *ractl)
+{
+ struct inode *inode = file_inode(ractl->file);
+ struct ceph_file_info *fi = ractl->file->private_data;
struct ceph_rw_context *rw_ctx;
- int rc = 0;
- int max = 0;
+ int got = 0;
+ int ret = 0;
if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
- return -EINVAL;
+ return;
- rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
- &nr_pages);
+ rw_ctx = ceph_find_rw_context(fi);
+ if (!rw_ctx) {
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ int want = CEPH_CAP_FILE_CACHE;
- if (rc == 0)
- goto out;
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0)
+ dout("start_read %p, error getting cap\n", inode);
+ else if (!(got & want))
+ dout("start_read %p, no cache cap\n", inode);
- rw_ctx = ceph_find_rw_context(fi);
- max = fsc->mount_options->rsize >> PAGE_SHIFT;
- dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
- inode, file, rw_ctx, nr_pages, max);
- while (!list_empty(page_list)) {
- rc = start_read(inode, rw_ctx, page_list, max);
- if (rc < 0)
- goto out;
+ if (ret <= 0)
+ return;
}
-out:
- ceph_fscache_readpages_cancel(inode, page_list);
-
- dout("readpages %p file %p ret %d\n", inode, file, rc);
- return rc;
+ netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
}
struct ceph_writeback_ctl
@@ -585,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
WARN_ON(!found);
}
- if (end > page_offset(page) + PAGE_SIZE)
- end = page_offset(page) + PAGE_SIZE;
+ if (end > page_offset(page) + thp_size(page))
+ end = page_offset(page) + thp_size(page);
return end > start ? end - start : 0;
}
@@ -604,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err;
- loff_t len = PAGE_SIZE;
+ loff_t len = thp_size(page);
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
@@ -632,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
+ page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
return 0;
}
@@ -658,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
/* it may be a short write due to an object boundary */
- WARN_ON_ONCE(len > PAGE_SIZE);
+ WARN_ON_ONCE(len > thp_size(page));
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
@@ -667,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (!err)
err = ceph_osdc_wait_request(osdc, req);
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, err);
ceph_osdc_put_request(req);
@@ -695,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage cleaned page %p\n", page);
err = 0; /* vfs expects us to return 0 */
}
- page->private = 0;
- ClearPagePrivate(page);
+ oldest = detach_page_private(page);
+ WARN_ON_ONCE(oldest != snapc);
end_page_writeback(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
@@ -755,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_clear_error_write(ci);
}
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, rc);
/*
@@ -788,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req)
clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
- ceph_put_snap_context(page_snap_context(page));
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %p\n", page);
+ ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
+ dout("unlocking %p\n", page);
if (remove_page)
generic_error_remove_page(inode->i_mapping,
@@ -949,7 +839,7 @@ get_more_pages:
page_offset(page) >= i_size_read(inode)) &&
clear_page_dirty_for_io(page))
mapping->a_ops->invalidatepage(page,
- 0, PAGE_SIZE);
+ 0, thp_size(page));
unlock_page(page);
continue;
}
@@ -1038,7 +928,7 @@ get_more_pages:
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
- len += PAGE_SIZE;
+ len += thp_size(page);
}
/* did we get anything? */
@@ -1087,7 +977,7 @@ new_request:
BUG_ON(IS_ERR(req));
}
BUG_ON(len < page_offset(pages[locked_pages - 1]) +
- PAGE_SIZE - offset);
+ thp_size(page) - offset);
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1117,7 +1007,7 @@ new_request:
}
set_page_writeback(pages[i]);
- len += PAGE_SIZE;
+ len += thp_size(page);
}
if (ceph_wbc.size_stable) {
@@ -1126,7 +1016,7 @@ new_request:
/* writepages_finish() clears writeback pages
* according to the data length, so make sure
* data length covers all locked pages */
- u64 min_len = len + 1 - PAGE_SIZE;
+ u64 min_len = len + 1 - thp_size(page);
len = get_writepages_data_length(inode, pages[i - 1],
offset);
len = max(len, min_len);
@@ -1302,6 +1192,31 @@ ceph_find_incompatible(struct page *page)
return NULL;
}
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct page *page, void **_fsdata)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+
+ snapc = ceph_find_incompatible(page);
+ if (snapc) {
+ int r;
+
+ unlock_page(page);
+ put_page(page);
+ if (IS_ERR(snapc))
+ return PTR_ERR(snapc);
+
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ return r == 0 ? -EAGAIN : r;
+ }
+ return 0;
+}
+
/*
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
@@ -1312,75 +1227,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_snap_context *snapc;
struct page *page = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
- int pos_in_page = pos & ~PAGE_MASK;
- int r = 0;
+ int r;
- dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
-
- for (;;) {
+ /*
+ * Uninlining should have already been done and everything updated, EXCEPT
+ * for inline_version sent to the MDS.
+ */
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
- r = -ENOMEM;
- break;
- }
-
- snapc = ceph_find_incompatible(page);
- if (snapc) {
- if (IS_ERR(snapc)) {
- r = PTR_ERR(snapc);
- break;
- }
- unlock_page(page);
- put_page(page);
- page = NULL;
- ceph_queue_writeback(inode);
- r = wait_event_killable(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r != 0)
- break;
- continue;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- break;
- }
+ if (!page)
+ return -ENOMEM;
/*
- * In some cases we don't need to read at all:
- * - full page write
- * - write that lies completely beyond EOF
- * - write that covers the the page from start to EOF or beyond it
+ * The inline_version on a new inode is set to 1. If that's the
+ * case, then the page is brand new and isn't yet Uptodate.
*/
- if ((pos_in_page == 0 && len == PAGE_SIZE) ||
- (pos >= i_size_read(inode)) ||
- (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
- zero_user_segments(page, 0, pos_in_page,
- pos_in_page + len, PAGE_SIZE);
- break;
+ r = 0;
+ if (index == 0 && ci->i_inline_version != 1) {
+ if (!PageUptodate(page)) {
+ WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
+ ci->i_inline_version);
+ r = -EINVAL;
+ }
+ goto out;
}
-
- /*
- * We need to read it. If we get back -EINPROGRESS, then the page was
- * handed off to fscache and it will be unlocked when the read completes.
- * Refind the page in that case so we can reacquire the page lock. Otherwise
- * we got a hard error or the read was completed synchronously.
- */
- r = ceph_do_readpage(file, page);
- if (r != -EINPROGRESS)
- break;
+ zero_user_segment(page, 0, thp_size(page));
+ SetPageUptodate(page);
+ goto out;
}
+ r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
+ &ceph_netfs_read_ops, NULL);
+out:
+ if (r == 0)
+ wait_on_page_fscache(page);
if (r < 0) {
- if (page) {
- unlock_page(page);
+ if (page)
put_page(page);
- }
} else {
+ WARN_ON_ONCE(!PageLocked(page));
*pagep = page;
}
return r;
@@ -1438,7 +1325,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations ceph_aops = {
.readpage = ceph_readpage,
- .readpages = ceph_readpages,
+ .readahead = ceph_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
@@ -1470,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
- struct page *pinned_page = NULL;
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
@@ -1478,21 +1364,20 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_block_sigs(&oldset);
- dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
- inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
+ dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
+ inode, ceph_vinop(inode), off);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
got = 0;
- err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
- &got, &pinned_page);
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
if (err < 0)
goto out_restore;
- dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
- inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
+ dout("filemap_fault %p %llu got cap refs on %s\n",
+ inode, off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
ci->i_inline_version == CEPH_INLINE_NONE) {
@@ -1500,14 +1385,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
ceph_del_rw_context(fi, &rw_ctx);
- dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n",
- inode, off, (size_t)PAGE_SIZE,
- ceph_cap_string(got), ret);
+ dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
+ inode, off, ceph_cap_string(got), ret);
} else
err = -EAGAIN;
- if (pinned_page)
- put_page(pinned_page);
ceph_put_cap_refs(ci, got);
if (err != -EAGAIN)
@@ -1542,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
- dout("filemap_fault %p %llu~%zd read inline data ret %x\n",
- inode, off, (size_t)PAGE_SIZE, ret);
+ dout("filemap_fault %p %llu read inline data ret %x\n",
+ inode, off, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
@@ -1553,9 +1435,6 @@ out_restore:
return ret;
}
-/*
- * Reuse write_begin here for simplicity.
- */
static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1591,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
goto out_free;
}
- if (off + PAGE_SIZE <= size)
- len = PAGE_SIZE;
+ if (off + thp_size(page) <= size)
+ len = thp_size(page);
else
- len = size & ~PAGE_MASK;
+ len = offset_in_thp(page, size);
dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
inode, ceph_vinop(inode), off, len, size);
@@ -1604,8 +1483,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER;
got = 0;
- err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
- &got, NULL);
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
if (err < 0)
goto out_free;
@@ -1832,7 +1710,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, err);
out_put:
@@ -2057,6 +1935,10 @@ int ceph_pool_perm_check(struct inode *inode, int need)
s64 pool;
int ret, flags;
+ /* Only need to do this for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
if (ci->i_vino.snap != CEPH_NOSNAP) {
/*
* Pool permission check needs to write to the first object.
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 2f5cb6bc78e1..9cfadbb86568 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
ci->fscache = NULL;
- fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
fscache_relinquish_cookie(cookie, &ci->i_vino, false);
}
@@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
dout("fscache_file_set_cookie %p %p disabling cache\n",
inode, filp);
fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
- fscache_uncache_all_inode_pages(ci->fscache, inode);
} else {
fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
ceph_fscache_can_enable, inode);
@@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
}
}
-static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
-{
- if (!error)
- SetPageUptodate(page);
-
- unlock_page(page);
-}
-
-static inline bool cache_valid(struct ceph_inode_info *ci)
-{
- return ci->i_fscache_gen == ci->i_rdcache_gen;
-}
-
-
-/* Atempt to read from the fscache,
- *
- * This function is called from the readpage_nounlock context. DO NOT attempt to
- * unlock the page here (or in the callback).
- */
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
-
- if (!cache_valid(ci))
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_page(ci->fscache, page,
- ceph_readpage_from_fscache_complete, NULL,
- GFP_KERNEL);
-
- switch (ret) {
- case 0: /* Page found */
- dout("page read submitted\n");
- return 0;
- case -ENOBUFS: /* Pages were not found, and can't be */
- case -ENODATA: /* Pages were not found */
- dout("page/inode not in cache\n");
- return ret;
- default:
- dout("%s: unknown error ret = %i\n", __func__, ret);
- return ret;
- }
-}
-
-int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
-
- if (!cache_valid(ci))
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
- ceph_readpage_from_fscache_complete,
- NULL, mapping_gfp_mask(mapping));
-
- switch (ret) {
- case 0: /* All pages found */
- dout("all-page read submitted\n");
- return 0;
- case -ENOBUFS: /* Some pages were not found, and can't be */
- case -ENODATA: /* some pages were not found */
- dout("page/inode not in cache\n");
- return ret;
- default:
- dout("%s: unknown error ret = %i\n", __func__, ret);
- return ret;
- }
-}
-
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
-
- if (!PageFsCache(page))
- return;
-
- if (!cache_valid(ci))
- return;
-
- ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
- GFP_KERNEL);
- if (ret)
- fscache_uncache_page(ci->fscache, page);
-}
-
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- if (!PageFsCache(page))
- return;
-
- fscache_wait_on_page_write(ci->fscache, page);
- fscache_uncache_page(ci->fscache, page);
-}
-
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
if (fscache_cookie_valid(fsc->fscache)) {
@@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
}
fsc->fscache = NULL;
}
-
-/*
- * caller should hold CEPH_CAP_FILE_{RD,CACHE}
- */
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
- if (cache_valid(ci))
- return;
-
- /* resue i_truncate_mutex. There should be no pending
- * truncate while the caller holds CEPH_CAP_FILE_RD */
- mutex_lock(&ci->i_truncate_mutex);
- if (!cache_valid(ci)) {
- if (fscache_check_consistency(ci->fscache, &ci->i_vino))
- fscache_invalidate(ci->fscache);
- spin_lock(&ci->i_ceph_lock);
- ci->i_fscache_gen = ci->i_rdcache_gen;
- spin_unlock(&ci->i_ceph_lock);
- }
- mutex_unlock(&ci->i_truncate_mutex);
-}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 89dbdd1eb14a..1409d6149281 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -9,6 +9,8 @@
#ifndef _CEPH_CACHE_H
#define _CEPH_CACHE_H
+#include <linux/netfs.h>
+
#ifdef CONFIG_CEPH_FSCACHE
extern struct fscache_netfs ceph_cache_netfs;
@@ -29,54 +31,37 @@ int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages);
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
ci->fscache = NULL;
- ci->i_fscache_gen = 0;
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- fscache_invalidate(ceph_inode(inode)->fscache);
+ return ci->fscache;
}
-static inline void ceph_fscache_uncache_page(struct inode *inode,
- struct page *page)
+static inline void ceph_fscache_invalidate(struct inode *inode)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_uncache_page(ci->fscache, page);
+ fscache_invalidate(ceph_inode(inode)->fscache);
}
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
- struct inode* inode = page->mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_maybe_release_page(ci->fscache, page, gfp);
-}
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
- struct page *page)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
- __fscache_uncache_page(ci->fscache, page);
+ if (!cookie)
+ return false;
+ return fscache_cookie_enabled(cookie);
}
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_readpages_cancel(ci->fscache, pages);
-}
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
-{
- ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+ return fscache_begin_read_operation(rreq, cookie);
}
-
#else
static inline int ceph_fscache_register(void)
@@ -102,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
}
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+ return NULL;
+}
+
static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
{
}
@@ -115,62 +105,19 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode,
{
}
-static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
-}
-
-static inline void ceph_fscache_uncache_page(struct inode *inode,
- struct page *pages)
-{
-}
-
-static inline int ceph_readpage_from_fscache(struct inode* inode,
- struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return -ENOBUFS;
-}
-
-static inline void ceph_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{
-}
-
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
-static inline void ceph_invalidate_fscache_page(struct inode *inode,
- struct page *page)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
+ return false;
}
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
-{
- return 1;
-}
-
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
- struct page *page)
-{
-}
-
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
-{
-}
-
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
{
+ return -ENOBUFS;
}
-
#endif
-#endif
+#endif /* _CEPH_CACHE_H */
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3c03fa37cac4..a5e93b185515 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->flush_tid = flush_tid;
arg->oldest_flush_tid = oldest_flush_tid;
- arg->size = inode->i_size;
+ arg->size = i_size_read(inode);
ci->i_reported_size = arg->size;
arg->max_size = ci->i_wanted_max_size;
if (cap == ci->i_auth_cap) {
@@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
@@ -1884,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = ci->vfs_inode.i_size;
+ loff_t size = i_size_read(&ci->vfs_inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -2730,10 +2731,6 @@ again:
*got = need | want;
else
*got = need;
- if (S_ISREG(inode->i_mode) &&
- (need & CEPH_CAP_FILE_RD) &&
- !(*got & CEPH_CAP_FILE_CACHE))
- ceph_disable_fscache_readpage(ci);
ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
@@ -2858,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
* due to a small max_size, make sure we check_max_size (and possibly
* ask the mds) so we don't get hung up indefinitely.
*/
-int ceph_get_caps(struct file *filp, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page)
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
{
struct ceph_file_info *fi = filp->private_data;
struct inode *inode = file_inode(filp);
@@ -2957,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
struct page *page =
find_get_page(inode->i_mapping, 0);
if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- break;
- }
+ bool uptodate = PageUptodate(page);
+
put_page(page);
+ if (uptodate)
+ break;
}
/*
* drop cap refs first because getattr while
@@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
break;
}
-
- if (S_ISREG(ci->vfs_inode.i_mode) &&
- (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
- ceph_fscache_revalidate_cookie(ci);
-
*got = _got;
return 0;
}
@@ -3308,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode,
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- inode->i_size);
+ i_size_read(inode));
/*
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 66989c880adb..425f3356332a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p)
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
seq_printf(s, "-----------------------------------------------------------------------------------\n");
- spin_lock(&m->read_latency_lock);
+ spin_lock(&m->read_metric_lock);
total = m->total_reads;
sum = m->read_latency_sum;
avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
min = m->read_latency_min;
max = m->read_latency_max;
sq = m->read_latency_sq_sum;
- spin_unlock(&m->read_latency_lock);
+ spin_unlock(&m->read_metric_lock);
CEPH_METRIC_SHOW("read", total, avg, min, max, sq);
- spin_lock(&m->write_latency_lock);
+ spin_lock(&m->write_metric_lock);
total = m->total_writes;
sum = m->write_latency_sum;
avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
min = m->write_latency_min;
max = m->write_latency_max;
sq = m->write_latency_sq_sum;
- spin_unlock(&m->write_latency_lock);
+ spin_unlock(&m->write_metric_lock);
CEPH_METRIC_SHOW("write", total, avg, min, max, sq);
- spin_lock(&m->metadata_latency_lock);
+ spin_lock(&m->metadata_metric_lock);
total = m->total_metadatas;
sum = m->metadata_latency_sum;
avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
min = m->metadata_latency_min;
max = m->metadata_latency_max;
sq = m->metadata_latency_sq_sum;
- spin_unlock(&m->metadata_latency_lock);
+ spin_unlock(&m->metadata_metric_lock);
CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq);
seq_printf(s, "\n");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f7a790ed62c4..5624fae7a603 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
+ break;
case SEEK_SET:
break;
case SEEK_END:
retval = -EOPNOTSUPP;
+ goto out;
default:
goto out;
}
@@ -665,8 +667,8 @@ out:
/*
* Handle lookups for the hidden .snap directory.
*/
-int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
+struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
@@ -674,18 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
/* .snap dir? */
if (err == -ENOENT &&
ceph_snap(parent) == CEPH_NOSNAP &&
- strcmp(dentry->d_name.name,
- fsc->mount_options->snapdir_name) == 0) {
+ strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
+ struct dentry *res;
struct inode *inode = ceph_get_snapdir(parent);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
- dentry, dentry, inode);
- BUG_ON(!d_unhashed(dentry));
- d_add(dentry, inode);
- err = 0;
+
+ res = d_splice_alias(inode, dentry);
+ dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
+ dentry, dentry, inode, res);
+ if (res)
+ dentry = res;
}
- return err;
+ return dentry;
}
/*
@@ -741,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
+ struct dentry *res;
int op;
int mask;
int err;
@@ -791,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
- err = ceph_handle_snapdir(req, dentry, err);
+ res = ceph_handle_snapdir(req, dentry, err);
+ if (IS_ERR(res)) {
+ err = PTR_ERR(res);
+ } else {
+ dentry = res;
+ err = 0;
+ }
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
dout("lookup result=%p\n", dentry);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index f22156ee7306..65540a4429b2 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
vino.ino = ino;
vino.snap = CEPH_NOSNAP;
+
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-ESTALE);
+
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
@@ -178,8 +182,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_CAST(inode);
/* We need LINK caps to reliably check i_nlink */
err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
- if (err)
+ if (err) {
+ iput(inode);
return ERR_PTR(err);
+ }
/* -ESTALE if inode as been unlinked and no file is open */
if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
iput(inode);
@@ -212,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
vino.ino = sfh->ino;
vino.snap = sfh->snapid;
}
+
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-ESTALE);
+
inode = ceph_find_inode(sb, vino);
if (inode)
return d_obtain_alias(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 209535d5b8d3..77fc037d5beb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -739,9 +739,12 @@ retry:
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
- err = ceph_handle_snapdir(req, dentry, err);
- if (err)
+ dentry = ceph_handle_snapdir(req, dentry, err);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
goto out_req;
+ }
+ err = 0;
if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -892,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
if (!ret)
ret = ceph_osdc_wait_request(osdc, req);
- ceph_update_read_latency(&fsc->mdsc->metric,
+ ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
req->r_end_latency,
ret);
@@ -1034,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
dout("ceph_aio_complete_req %p rc %d bytes %u\n",
inode, rc, osd_data->bvec_pos.iter.bi_size);
- /* r_start_latency == 0 means the request was not submitted */
- if (req->r_start_latency) {
- if (aio_req->write)
- ceph_update_write_latency(metric, req->r_start_latency,
- req->r_end_latency, rc);
- else
- ceph_update_read_latency(metric, req->r_start_latency,
- req->r_end_latency, rc);
- }
-
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
BUG_ON(!aio_req->write);
@@ -1086,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
}
+ /* r_start_latency == 0 means the request was not submitted */
+ if (req->r_start_latency) {
+ if (aio_req->write)
+ ceph_update_write_metrics(metric, req->r_start_latency,
+ req->r_end_latency, rc);
+ else
+ ceph_update_read_metrics(metric, req->r_start_latency,
+ req->r_end_latency, rc);
+ }
+
put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
aio_req->should_dirty);
ceph_osdc_put_request(req);
@@ -1290,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (write)
- ceph_update_write_latency(metric, req->r_start_latency,
+ ceph_update_write_metrics(metric, req->r_start_latency,
req->r_end_latency, ret);
else
- ceph_update_read_latency(metric, req->r_start_latency,
+ ceph_update_read_metrics(metric, req->r_start_latency,
req->r_end_latency, ret);
size = i_size_read(inode);
@@ -1467,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, ret);
out:
ceph_osdc_put_request(req);
@@ -1510,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
size_t len = iov_iter_count(to);
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *pinned_page = NULL;
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
int want, got = 0;
@@ -1529,8 +1531,7 @@ again:
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
- ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
- &got, &pinned_page);
+ ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
if (ret < 0) {
if (iocb->ki_flags & IOCB_DIRECT)
ceph_end_io_direct(inode);
@@ -1571,10 +1572,6 @@ again:
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
- if (pinned_page) {
- put_page(pinned_page);
- pinned_page = NULL;
- }
ceph_put_cap_refs(ci, got);
if (direct_lock)
@@ -1753,8 +1750,7 @@ retry_snap:
else
want = CEPH_CAP_FILE_BUFFER;
got = 0;
- err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
- &got, NULL);
+ err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
if (err < 0)
goto out;
@@ -2083,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode,
else
want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+ ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
if (ret < 0)
goto unlock;
@@ -2121,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got,
retry_caps:
ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
- dst_endoff, dst_got, NULL);
+ dst_endoff, dst_got);
if (ret < 0)
return ret;
@@ -2143,7 +2139,7 @@ retry_caps:
return ret;
}
ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
- CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
+ CEPH_CAP_FILE_SHARED, -1, src_got);
if (ret < 0)
return ret;
/*... drop src_ci caps too, and retry */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 689e3ffd29d7..e1c63adb196d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-EREMOTEIO);
+
inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
ceph_set_ino_cb, &vino);
if (!inode)
@@ -99,14 +102,15 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mtime = parent->i_mtime;
inode->i_ctime = parent->i_ctime;
inode->i_atime = parent->i_atime;
- inode->i_op = &ceph_snapdir_iops;
- inode->i_fop = &ceph_snapdir_fops;
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
- if (inode->i_state & I_NEW)
+ if (inode->i_state & I_NEW) {
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
unlock_new_inode(inode);
+ }
return inode;
}
@@ -628,10 +632,11 @@ int ceph_fill_file_size(struct inode *inode, int issued,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
+ loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
+ (truncate_seq == ci->i_truncate_seq && size > isize)) {
+ dout("size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
pr_err("fill_file_size non-zero size for directory\n");
size = 0;
@@ -925,6 +930,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
ci->i_dir_pin = iinfo->dir_pin;
+ ci->i_rsnaps = iinfo->rsnaps;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -1818,7 +1824,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
@@ -1894,6 +1900,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_pages %p fails\n", inode);
}
@@ -2124,20 +2131,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode,
- inode->i_size, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) &&
- attr->ia_size > inode->i_size) {
+ loff_t isize = i_size_read(inode);
+
+ dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
ia_valid |= ATTR_MTIME;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- attr->ia_size != inode->i_size) {
+ attr->ia_size != isize) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size =
- cpu_to_le64(inode->i_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
@@ -2247,7 +2253,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size > max(inode->i_size, fsc->max_file_size))
+ attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
return -EFBIG;
if ((attr->ia_valid & ATTR_SIZE) &&
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
index 97602ea92ff4..c456509b31c3 100644
--- a/fs/ceph/io.c
+++ b/fs/ceph/io.c
@@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
}
/**
- * ceph_end_io_direct - declare the file is being used for direct i/o
+ * ceph_start_io_direct - declare the file is being used for direct i/o
* @inode: file inode
*
* Declare that a direct I/O operation is about to start, and ensure
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d87bd852ed96..e5af591d3bd4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end,
memset(&info->snap_btime, 0, sizeof(info->snap_btime));
}
+ /* snapshot count, remains zero for v<=3 */
+ if (struct_v >= 4) {
+ ceph_decode_64_safe(p, end, info->rsnaps, bad);
+ } else {
+ info->rsnaps = 0;
+ }
+
*p = end;
} else {
if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end,
}
info->dir_pin = -ENODATA;
- /* info->snap_btime remains zero */
+ /* info->snap_btime and info->rsnaps remain zero */
}
return 0;
bad:
@@ -433,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end,
ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad);
+
+ /* Don't accept a delegation of system inodes */
+ if (start < CEPH_INO_SYSTEM_BASE) {
+ pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
+ continue;
+ }
while (len--) {
int err = xa_insert(&s->s_delegated_inos, ino = start++,
DELEGATED_INO_AVAILABLE,
@@ -3306,7 +3320,7 @@ out_err:
/* kick calling process */
complete_request(mdsc, req);
- ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency,
+ ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
req->r_end_latency, err);
out:
ceph_mdsc_put_request(req);
@@ -3780,7 +3794,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
- rec.v1.size = cpu_to_le64(inode->i_size);
+ rec.v1.size = cpu_to_le64(i_size_read(inode));
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index eaa7c5422116..15c11a0f2caf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in {
s32 dir_pin;
struct ceph_timespec btime;
struct ceph_timespec snap_btime;
+ u64 rsnaps;
u64 change_attr;
};
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 5ec94bd4c1de..28b6b42ad677 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_metric_write_latency *write;
struct ceph_metric_metadata_latency *meta;
struct ceph_metric_dlease *dlease;
+ struct ceph_opened_files *files;
+ struct ceph_pinned_icaps *icaps;
+ struct ceph_opened_inodes *inodes;
struct ceph_client_metric *m = &mdsc->metric;
u64 nr_caps = atomic64_read(&m->total_caps);
struct ceph_msg *msg;
@@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
s32 len;
len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
- + sizeof(*meta) + sizeof(*dlease);
+ + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+ + sizeof(*icaps) + sizeof(*inodes);
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) {
@@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
items++;
+ sum = percpu_counter_sum(&m->total_inodes);
+
+ /* encode the opened files metric */
+ files = (struct ceph_opened_files *)(dlease + 1);
+ files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+ files->ver = 1;
+ files->compat = 1;
+ files->data_len = cpu_to_le32(sizeof(*files) - 10);
+ files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
+ files->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the pinned icaps metric */
+ icaps = (struct ceph_pinned_icaps *)(files + 1);
+ icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+ icaps->ver = 1;
+ icaps->compat = 1;
+ icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10);
+ icaps->pinned_icaps = cpu_to_le64(nr_caps);
+ icaps->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the opened inodes metric */
+ inodes = (struct ceph_opened_inodes *)(icaps + 1);
+ inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+ inodes->ver = 1;
+ inodes->compat = 1;
+ inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10);
+ inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
+ inodes->total = cpu_to_le64(sum);
+ items++;
+
put_unaligned_le32(items, &head->num);
msg->front.iov_len = len;
msg->hdr.version = cpu_to_le16(1);
@@ -183,21 +219,21 @@ int ceph_metric_init(struct ceph_client_metric *m)
if (ret)
goto err_i_caps_mis;
- spin_lock_init(&m->read_latency_lock);
+ spin_lock_init(&m->read_metric_lock);
m->read_latency_sq_sum = 0;
m->read_latency_min = KTIME_MAX;
m->read_latency_max = 0;
m->total_reads = 0;
m->read_latency_sum = 0;
- spin_lock_init(&m->write_latency_lock);
+ spin_lock_init(&m->write_metric_lock);
m->write_latency_sq_sum = 0;
m->write_latency_min = KTIME_MAX;
m->write_latency_max = 0;
m->total_writes = 0;
m->write_latency_sum = 0;
- spin_lock_init(&m->metadata_latency_lock);
+ spin_lock_init(&m->metadata_metric_lock);
m->metadata_latency_sq_sum = 0;
m->metadata_latency_min = KTIME_MAX;
m->metadata_latency_max = 0;
@@ -274,7 +310,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
*sq_sump += sq;
}
-void ceph_update_read_latency(struct ceph_client_metric *m,
+void ceph_update_read_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc)
{
@@ -283,14 +319,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m,
if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
return;
- spin_lock(&m->read_latency_lock);
+ spin_lock(&m->read_metric_lock);
__update_latency(&m->total_reads, &m->read_latency_sum,
&m->read_latency_min, &m->read_latency_max,
&m->read_latency_sq_sum, lat);
- spin_unlock(&m->read_latency_lock);
+ spin_unlock(&m->read_metric_lock);
}
-void ceph_update_write_latency(struct ceph_client_metric *m,
+void ceph_update_write_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc)
{
@@ -299,14 +335,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m,
if (unlikely(rc && rc != -ETIMEDOUT))
return;
- spin_lock(&m->write_latency_lock);
+ spin_lock(&m->write_metric_lock);
__update_latency(&m->total_writes, &m->write_latency_sum,
&m->write_latency_min, &m->write_latency_max,
&m->write_latency_sq_sum, lat);
- spin_unlock(&m->write_latency_lock);
+ spin_unlock(&m->write_metric_lock);
}
-void ceph_update_metadata_latency(struct ceph_client_metric *m,
+void ceph_update_metadata_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc)
{
@@ -315,9 +351,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m,
if (unlikely(rc && rc != -ENOENT))
return;
- spin_lock(&m->metadata_latency_lock);
+ spin_lock(&m->metadata_metric_lock);
__update_latency(&m->total_metadatas, &m->metadata_latency_sum,
&m->metadata_latency_min, &m->metadata_latency_max,
&m->metadata_latency_sq_sum, lat);
- spin_unlock(&m->metadata_latency_lock);
+ spin_unlock(&m->metadata_metric_lock);
}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index af6038ff39d4..e984eb2bb14b 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -14,8 +14,11 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_WRITE_LATENCY,
CLIENT_METRIC_TYPE_METADATA_LATENCY,
CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_OPENED_FILES,
+ CLIENT_METRIC_TYPE_PINNED_ICAPS,
+ CLIENT_METRIC_TYPE_OPENED_INODES,
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES,
};
/*
@@ -28,6 +31,9 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
\
CLIENT_METRIC_TYPE_MAX, \
}
@@ -94,6 +100,42 @@ struct ceph_metric_dlease {
__le64 total;
} __packed;
+/*
+ * Wire record for the "opened files" client metric (presumably tagged with
+ * CLIENT_METRIC_TYPE_OPENED_FILES - the fill code is not visible here).
+ *
+ * The struct is __packed: type, ver, compat and data_len occupy 10 bytes and
+ * are followed by a 16-byte payload of two little-endian 64-bit values.
+ */
+struct ceph_opened_files {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* payload length in bytes: opened_files + total */
+ __le64 opened_files;
+ __le64 total; /* NOTE(review): meaning of "total" not visible here - confirm */
+} __packed;
+
+/*
+ * Wire record for the "pinned i_caps" client metric (presumably tagged with
+ * CLIENT_METRIC_TYPE_PINNED_ICAPS - the fill code is not visible here).
+ *
+ * The struct is __packed: type, ver, compat and data_len occupy 10 bytes and
+ * are followed by a 16-byte payload of two little-endian 64-bit values.
+ */
+struct ceph_pinned_icaps {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* payload length in bytes: pinned_icaps + total */
+ __le64 pinned_icaps;
+ __le64 total; /* NOTE(review): meaning of "total" not visible here - confirm */
+} __packed;
+
+/*
+ * Wire record for the "opened inodes" client metric; the sender sets type to
+ * CLIENT_METRIC_TYPE_OPENED_INODES and both ver and compat to 1.
+ *
+ * The struct is __packed: type, ver, compat and data_len occupy 10 bytes, and
+ * the sender fills data_len with sizeof(struct) - 10, i.e. the size of the
+ * two trailing little-endian 64-bit values.
+ */
+struct ceph_opened_inodes {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* payload length in bytes: opened_inodes + total */
+ __le64 opened_inodes; /* filled from percpu_counter_sum() of m->opened_inodes */
+ __le64 total; /* filled from a local sum; NOTE(review): confirm what it counts */
+} __packed;
+
struct ceph_metric_head {
__le32 num; /* the number of metrics that will be sent */
} __packed;
@@ -108,21 +150,21 @@ struct ceph_client_metric {
struct percpu_counter i_caps_hit;
struct percpu_counter i_caps_mis;
- spinlock_t read_latency_lock;
+ spinlock_t read_metric_lock;
u64 total_reads;
ktime_t read_latency_sum;
ktime_t read_latency_sq_sum;
ktime_t read_latency_min;
ktime_t read_latency_max;
- spinlock_t write_latency_lock;
+ spinlock_t write_metric_lock;
u64 total_writes;
ktime_t write_latency_sum;
ktime_t write_latency_sq_sum;
ktime_t write_latency_min;
ktime_t write_latency_max;
- spinlock_t metadata_latency_lock;
+ spinlock_t metadata_metric_lock;
u64 total_metadatas;
ktime_t metadata_latency_sum;
ktime_t metadata_latency_sq_sum;
@@ -162,13 +204,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
percpu_counter_inc(&m->i_caps_mis);
}
-extern void ceph_update_read_latency(struct ceph_client_metric *m,
+extern void ceph_update_read_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc);
-extern void ceph_update_write_latency(struct ceph_client_metric *m,
+extern void ceph_update_write_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc);
-extern void ceph_update_metadata_latency(struct ceph_client_metric *m,
+extern void ceph_update_metadata_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc);
#endif /* _FS_CEPH_MDS_METRIC_H */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0728b01d4d43..4ce18055d931 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
- capsnap->size = inode->i_size;
+ capsnap->size = i_size_read(inode);
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
capsnap->ctime = inode->i_ctime;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c48bb30c8d70..db80d89556b1 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,6 +21,7 @@
#include <linux/ceph/libceph.h>
#ifdef CONFIG_CEPH_FSCACHE
+#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
#endif
@@ -333,7 +334,7 @@ struct ceph_inode_info {
/* for dirs */
struct timespec64 i_rctime;
- u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps;
u64 i_files, i_subdirs;
/* quotas */
@@ -427,7 +428,6 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
- u32 i_fscache_gen;
#endif
errseq_t i_meta_err;
@@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+/*
+ * The MDS reserves a set of inodes for its own usage. These should never
+ * be accessible by clients, and so the MDS has no reason to ever hand these
+ * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE.
+ *
+ * These come from src/mds/mdstypes.h in the ceph sources.
+ */
+#define CEPH_MAX_MDS 0x100
+#define CEPH_NUM_STRAY 10
+#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
+#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
+
+/*
+ * Return true (and emit a rate-limited warning) when vino.ino lies in the
+ * half-open range [CEPH_MDS_INO_MDSDIR_OFFSET, CEPH_INO_SYSTEM_BASE);
+ * return false for any inode number outside that range.
+ */
+static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
+{
+	/* Common case first: the number is outside the reserved window. */
+	if (vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET ||
+	    vino.ino >= CEPH_INO_SYSTEM_BASE)
+		return false;
+
+	WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino);
+	return true;
+}
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
+ if (ceph_vino_is_reserved(vino))
+ return NULL;
+
/*
* NB: The hashval will be run through the fs/inode.c hash function
* anyway, so there is no need to squash the inode number down to
@@ -1156,7 +1180,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
int mds, int drop, int unless);
extern int ceph_get_caps(struct file *filp, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page);
+ loff_t endoff, int *got);
extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
@@ -1193,7 +1217,7 @@ extern const struct dentry_operations ceph_dentry_ops;
extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 02f59bcb4f27..1242db8d3444 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
}
+/*
+ * vxattr callback for the directory "rsnaps" field (presumably the one wired
+ * up by XATTR_RSTAT_FIELD(dir, rsnaps)): formats ci->i_rsnaps as a decimal
+ * number into val through ceph_fmt_xattr() and returns that helper's result.
+ *
+ * NOTE(review): i_rsnaps is declared u64 but is printed with "%lld", the same
+ * as the neighbouring rstat callbacks - confirm this is intentional.
+ */
+static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps);
+}
+
static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
size_t size)
{
@@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rentries),
XATTR_RSTAT_FIELD(dir, rfiles),
XATTR_RSTAT_FIELD(dir, rsubdirs),
+ XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime),
{
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 2a5325a7ae49..9c45b3a82ad9 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -55,6 +55,7 @@
#define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */
#define CIFS_MOUNT_RO_CACHE 0x20000000 /* assumes share will not change */
#define CIFS_MOUNT_RW_CACHE 0x40000000 /* assumes only client accessing */
+#define CIFS_MOUNT_SHUTDOWN 0x80000000
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 153d5c842a9b..37fc7d6ac457 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -57,6 +57,12 @@ struct smb_query_info {
/* char buffer[]; */
} __packed;
+/*
+ * Dumping the commonly used 16 byte (e.g. CCM and GCM128) keys is still
+ * supported for backlevel compatibility, but is not sufficient for dumping
+ * the less frequently used GCM256 (32 byte) keys (see the newer
+ * "CIFS_DUMP_FULL_KEY" ioctl for dumping decryption info for GCM256 mounts).
+ */
struct smb3_key_debug_info {
__u64 Suid;
__u16 cipher_type;
@@ -65,6 +71,31 @@ struct smb3_key_debug_info {
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
} __packed;
+/*
+ * Argument block for the CIFS_DUMP_FULL_KEY ioctl: dumps variable-sized keys.
+ *
+ * The struct is __packed and consists of a fixed header followed by the
+ * flexible array data[], which carries the key bytes back to the caller in
+ * the order listed at the end of the struct. Each *_key_length field is a
+ * single byte giving the length of the corresponding key.
+ */
+struct smb3_full_key_debug_info {
+ /* INPUT: size of userspace buffer */
+ __u32 in_size;
+
+ /*
+ * INPUT: 0 for current user, otherwise session to dump
+ * OUTPUT: session id that was dumped
+ */
+ __u64 session_id;
+ __u16 cipher_type;
+ __u8 session_key_length;
+ __u8 server_in_key_length;
+ __u8 server_out_key_length;
+ __u8 data[];
+ /*
+ * return this struct with the keys appended at the end:
+ * __u8 session_key[session_key_length];
+ * __u8 server_in_key[server_in_key_length];
+ * __u8 server_out_key[server_out_key_length];
+ */
+} __packed;
+
struct smb3_notify {
__u32 completion_filter;
bool watch_tree;
@@ -78,3 +109,20 @@ struct smb3_notify {
#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
+#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
+#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+
+/*
+ * Flags for going down operation
+ */
+#define CIFS_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define CIFS_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define CIFS_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+/*
+ * Report whether CIFS_MOUNT_SHUTDOWN is set in the superblock info's
+ * mnt_cifs_flags, i.e. whether this mount has been marked as shut down.
+ */
+static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi)
+{
+	return (sbi->mnt_cifs_flags & CIFS_MOUNT_SHUTDOWN) != 0;
+}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5f2c139143a7..2ffcb29d5c8f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -75,7 +75,7 @@ bool enable_oplocks = true;
bool linuxExtEnabled = true;
bool lookupCacheEnabled = true;
bool disable_legacy_dialects; /* false by default */
-bool enable_gcm_256; /* false by default, change when more servers support it */
+bool enable_gcm_256 = true;
bool require_gcm_256; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
@@ -133,6 +133,7 @@ struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
+struct workqueue_struct *deferredclose_wq;
__u32 cifs_lock_secret;
/*
@@ -390,6 +391,8 @@ cifs_alloc_inode(struct super_block *sb)
/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
INIT_LIST_HEAD(&cifs_inode->llist);
+ INIT_LIST_HEAD(&cifs_inode->deferred_closes);
+ spin_lock_init(&cifs_inode->deferred_lock);
return &cifs_inode->vfs_inode;
}
@@ -860,13 +863,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
goto out;
}
- /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */
- kfree(cifs_sb->ctx->UNC);
- cifs_sb->ctx->UNC = NULL;
- kfree(cifs_sb->ctx->prepath);
- cifs_sb->ctx->prepath = NULL;
-
- rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
+ rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL);
if (rc) {
root = ERR_PTR(rc);
goto out;
@@ -1637,9 +1634,16 @@ init_cifs(void)
goto out_destroy_fileinfo_put_wq;
}
+ deferredclose_wq = alloc_workqueue("deferredclose",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!deferredclose_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_cifsoplockd_wq;
+ }
+
rc = cifs_fscache_register();
if (rc)
- goto out_destroy_cifsoplockd_wq;
+ goto out_destroy_deferredclose_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1707,6 +1711,8 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
+out_destroy_deferredclose_wq:
+ destroy_workqueue(deferredclose_wq);
out_destroy_cifsoplockd_wq:
destroy_workqueue(cifsoplockd_wq);
out_destroy_fileinfo_put_wq:
@@ -1741,6 +1747,7 @@ exit_cifs(void)
cifs_destroy_mids();
cifs_destroy_inodecache();
cifs_fscache_unregister();
+ destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
destroy_workqueue(fileinfo_put_wq);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b23a0ee8c6f8..8488d7024462 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1154,6 +1154,14 @@ struct cifs_pending_open {
__u32 oplock;
};
+/*
+ * Bookkeeping entry for a file handle whose close has been deferred.
+ * cifs_close() allocates one and hands it to cifs_add_deferred_close() while
+ * holding the inode's deferred_lock.
+ *
+ * NOTE(review): presumably linked through dlist onto
+ * cifsInodeInfo.deferred_closes, with the fid fields copied from the open
+ * handle - confirm in cifs_add_deferred_close().
+ */
+struct cifs_deferred_close {
+ struct list_head dlist;
+ struct tcon_link *tlink;
+ __u16 netfid;
+ __u64 persistent_fid;
+ __u64 volatile_fid;
+};
+
/*
* This info hangs off the cifsFileInfo structure, pointed to by llist.
* This is used to track byte stream locks on the file
@@ -1248,6 +1256,8 @@ struct cifsFileInfo {
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
struct work_struct put; /* work for the final part of _put */
+ struct delayed_work deferred;
+ bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
};
struct cifs_io_parms {
@@ -1392,6 +1402,7 @@ struct cifsInodeInfo {
#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
+#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */
unsigned long flags;
spinlock_t writers_lock;
unsigned int writers; /* Number of writers on this inode */
@@ -1404,6 +1415,9 @@ struct cifsInodeInfo {
struct fscache_cookie *fscache;
#endif
struct inode vfs_inode;
+ struct list_head deferred_closes; /* list of deferred closes */
+ spinlock_t deferred_lock; /* protection on deferred list */
+ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
};
static inline struct cifsInodeInfo *
@@ -1871,11 +1885,14 @@ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
void cifs_oplock_break(struct work_struct *work);
void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
+void smb2_deferred_work_close(struct work_struct *work);
+extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
extern struct workqueue_struct *decrypt_wq;
extern struct workqueue_struct *fileinfo_put_wq;
extern struct workqueue_struct *cifsoplockd_wq;
+extern struct workqueue_struct *deferredclose_wq;
extern __u32 cifs_lock_secret;
extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b53a87db282f..554d64fe171e 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -148,7 +148,8 @@
#define SMB3_SIGN_KEY_SIZE (16)
/*
- * Size of the smb3 encryption/decryption keys
+ * Size of the smb3 encryption/decryption key storage.
+ * This size is big enough to store any cipher key types.
*/
#define SMB3_ENC_DEC_KEY_SIZE (32)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a79d50001fbf..d30cba44ba29 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -267,6 +267,19 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
struct tcon_link *tlink,
struct cifs_pending_open *open);
extern void cifs_del_pending_open(struct cifs_pending_open *open);
+
+extern bool cifs_is_deferred_close(struct cifsFileInfo *cfile,
+ struct cifs_deferred_close **dclose);
+
+extern void cifs_add_deferred_close(struct cifsFileInfo *cfile,
+ struct cifs_deferred_close *dclose);
+
+extern void cifs_del_deferred_close(struct cifsFileInfo *cfile);
+
+extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
+
+extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
+
extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 121d8b4535b0..495c395f9def 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -392,16 +392,6 @@ cifs_echo_request(struct work_struct *work)
int rc;
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, echo.work);
- unsigned long echo_interval;
-
- /*
- * If we need to renegotiate, set echo interval to zero to
- * immediately call echo service where we can renegotiate.
- */
- if (server->tcpStatus == CifsNeedNegotiate)
- echo_interval = 0;
- else
- echo_interval = server->echo_interval;
/*
* We cannot send an echo if it is disabled.
@@ -412,7 +402,7 @@ cifs_echo_request(struct work_struct *work)
server->tcpStatus == CifsExiting ||
server->tcpStatus == CifsNew ||
(server->ops->can_echo && !server->ops->can_echo(server)) ||
- time_before(jiffies, server->lstrp + echo_interval - HZ))
+ time_before(jiffies, server->lstrp + server->echo_interval - HZ))
goto requeue_echo;
rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
@@ -476,6 +466,7 @@ server_unresponsive(struct TCP_Server_Info *server)
*/
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
+ (!server->ops->can_echo || server->ops->can_echo(server)) &&
time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
@@ -3158,17 +3149,29 @@ out:
int
cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname)
{
- int rc = 0;
+ int rc;
- smb3_parse_devname(devname, ctx);
+ if (devname) {
+ cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname);
+ rc = smb3_parse_devname(devname, ctx);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc);
+ return rc;
+ }
+ }
if (mntopts) {
char *ip;
- cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts);
rc = smb3_parse_opt(mntopts, "ip", &ip);
- if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip,
- strlen(ip))) {
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc);
+ return rc;
+ }
+
+ rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip));
+ kfree(ip);
+ if (!rc) {
cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__);
return -EINVAL;
}
@@ -3188,7 +3191,7 @@ cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const c
return -EINVAL;
}
- return rc;
+ return 0;
}
static int
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index c85aff838305..6bcd3e8f7cda 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -34,6 +34,7 @@
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "fs_context.h"
+#include "cifs_ioctl.h"
static void
renew_parental_timestamps(struct dentry *direntry)
@@ -430,6 +431,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
__u32 oplock;
struct cifsFileInfo *file_info;
+ if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+ return -EIO;
+
/*
* Posix open is only called (at lookup time) for file create now. For
* opens (rather than creates), because we do not know if it is a file
@@ -546,6 +550,9 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
inode, direntry, direntry);
+ if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+ return -EIO;
+
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
if (IS_ERR(tlink))
@@ -583,6 +590,9 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
return -EINVAL;
cifs_sb = CIFS_SB(inode->i_sb);
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 639c59596d4f..379a427f3c2f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -45,6 +45,7 @@
#include "fscache.h"
#include "smbdirect.h"
#include "fs_context.h"
+#include "cifs_ioctl.h"
static inline int cifs_convert_flags(unsigned int flags)
{
@@ -322,9 +323,11 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->dentry = dget(dentry);
cfile->f_flags = file->f_flags;
cfile->invalidHandle = false;
+ cfile->deferred_close_scheduled = false;
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
INIT_WORK(&cfile->put, cifsFileInfo_put_work);
+ INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close);
mutex_init(&cfile->fh_mutex);
spin_lock_init(&cfile->file_info_lock);
@@ -539,6 +542,11 @@ int cifs_open(struct inode *inode, struct file *file)
xid = get_xid();
cifs_sb = CIFS_SB(inode->i_sb);
+ if (unlikely(cifs_forced_shutdown(cifs_sb))) {
+ free_xid(xid);
+ return -EIO;
+ }
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
free_xid(xid);
@@ -565,6 +573,20 @@ int cifs_open(struct inode *inode, struct file *file)
file->f_op = &cifs_file_direct_ops;
}
+ /* Get the cached handle as SMB2 close is deferred */
+ rc = cifs_get_readable_path(tcon, full_path, &cfile);
+ if (rc == 0) {
+ if (file->f_flags == cfile->f_flags) {
+ file->private_data = cfile;
+ spin_lock(&CIFS_I(inode)->deferred_lock);
+ cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(inode)->deferred_lock);
+ goto out;
+ } else {
+ _cifsFileInfo_put(cfile, true, false);
+ }
+ }
+
if (server->oplocks)
oplock = REQ_OPLOCK;
else
@@ -846,11 +868,59 @@ reopen_error_exit:
return rc;
}
+/*
+ * Delayed-work handler installed on cifsFileInfo.deferred.
+ *
+ * Under the owning inode's deferred_lock it calls cifs_del_deferred_close()
+ * for the file and clears deferred_close_scheduled; afterwards it calls
+ * _cifsFileInfo_put() on the file.
+ */
+void smb2_deferred_work_close(struct work_struct *work)
+{
+	struct cifsFileInfo *cfile;
+	struct cifsInodeInfo *cinode;
+
+	cfile = container_of(work, struct cifsFileInfo, deferred.work);
+	cinode = CIFS_I(d_inode(cfile->dentry));
+
+	spin_lock(&cinode->deferred_lock);
+	cifs_del_deferred_close(cfile);
+	cfile->deferred_close_scheduled = false;
+	spin_unlock(&cinode->deferred_lock);
+
+	_cifsFileInfo_put(cfile, true, false);
+}
+
int cifs_close(struct inode *inode, struct file *file)
{
+ struct cifsFileInfo *cfile;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_deferred_close *dclose;
+
if (file->private_data != NULL) {
- _cifsFileInfo_put(file->private_data, true, false);
+ cfile = file->private_data;
file->private_data = NULL;
+ dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
+ if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
+ cinode->lease_granted &&
+ dclose) {
+ if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags))
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ spin_lock(&cinode->deferred_lock);
+ cifs_add_deferred_close(cfile, dclose);
+ if (cfile->deferred_close_scheduled &&
+ delayed_work_pending(&cfile->deferred)) {
+ /*
+ * If there is no pending work, mod_delayed_work queues new work.
+ * So, Increase the ref count to avoid use-after-free.
+ */
+ if (!mod_delayed_work(deferredclose_wq,
+ &cfile->deferred, cifs_sb->ctx->acregmax))
+ cifsFileInfo_get(cfile);
+ } else {
+ /* Deferred close for files */
+ queue_delayed_work(deferredclose_wq,
+ &cfile->deferred, cifs_sb->ctx->acregmax);
+ cfile->deferred_close_scheduled = true;
+ spin_unlock(&cinode->deferred_lock);
+ return 0;
+ }
+ spin_unlock(&cinode->deferred_lock);
+ _cifsFileInfo_put(cfile, true, false);
+ } else {
+ _cifsFileInfo_put(cfile, true, false);
+ kfree(dclose);
+ }
}
/* return code from the ->release op is always ignored */
@@ -1920,8 +1990,10 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
if (total_written > 0) {
spin_lock(&d_inode(dentry)->i_lock);
- if (*offset > d_inode(dentry)->i_size)
+ if (*offset > d_inode(dentry)->i_size) {
i_size_write(d_inode(dentry), *offset);
+ d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9;
+ }
spin_unlock(&d_inode(dentry)->i_lock);
}
mark_inode_dirty_sync(d_inode(dentry));
@@ -1947,7 +2019,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
- if (!open_file->invalidHandle) {
+ if ((!open_file->invalidHandle)) {
/* found a good file */
/* lock it so it will not be closed on us */
cifsFileInfo_get(open_file);
@@ -2476,6 +2548,8 @@ retry:
if (cfile)
cifsFileInfo_put(cfile);
free_xid(xid);
+ /* Indication to update ctime and mtime as close is deferred */
+ set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
return rc;
}
@@ -2577,13 +2651,17 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
if (rc > 0) {
spin_lock(&inode->i_lock);
- if (pos > inode->i_size)
+ if (pos > inode->i_size) {
i_size_write(inode, pos);
+ inode->i_blocks = (512 - 1 + pos) >> 9;
+ }
spin_unlock(&inode->i_lock);
}
unlock_page(page);
put_page(page);
+ /* Indication to update ctime and mtime as close is deferred */
+ set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
return rc;
}
@@ -4744,6 +4822,8 @@ void cifs_oplock_break(struct work_struct *work)
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
bool purge_cache = false;
+ bool is_deferred = false;
+ struct cifs_deferred_close *dclose;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -4790,6 +4870,24 @@ oplock_break_ack:
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ /*
+ * When oplock break is received and there are no active
+ * file handles but cached, then schedule deferred close immediately.
+ * So, new open will not use cached handle.
+ */
+ spin_lock(&CIFS_I(inode)->deferred_lock);
+ is_deferred = cifs_is_deferred_close(cfile, &dclose);
+ if (is_deferred &&
+ cfile->deferred_close_scheduled &&
+ delayed_work_pending(&cfile->deferred)) {
+ /*
+ * If there is no pending work, mod_delayed_work queues new work.
+ * So, Increase the ref count to avoid use-after-free.
+ */
+ if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
+ cifsFileInfo_get(cfile);
+ }
+ spin_unlock(&CIFS_I(inode)->deferred_lock);
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 3e0d016849e3..92d4ab029c91 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -476,6 +476,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
/* move "pos" up to delimiter or NULL */
pos += len;
+ kfree(ctx->UNC);
ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
if (!ctx->UNC)
return -ENOMEM;
@@ -486,6 +487,9 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (*pos == '/' || *pos == '\\')
pos++;
+ kfree(ctx->prepath);
+ ctx->prepath = NULL;
+
/* If pos is NULL then no prepath */
if (!*pos)
return 0;
@@ -1017,6 +1021,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
ctx->max_channels = result.uint_32;
+ /* If more than one channel requested ... they want multichan */
+ if (result.uint_32 > 1)
+ ctx->multichannel = true;
break;
case Opt_handletimeout:
ctx->handle_timeout = result.uint_32;
@@ -1138,7 +1145,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
/* if iocharset not set then load_nls_default
* is used by caller
*/
- cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset);
+ cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset);
break;
case Opt_netbiosname:
memset(ctx->source_rfc1001_name, 0x20,
@@ -1642,6 +1649,7 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb)
cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
}
}
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN;
return;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 002d864b8f7b..1dfa57982522 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -26,7 +26,6 @@
#include <linux/sched/signal.h>
#include <linux/wait_bit.h>
#include <linux/fiemap.h>
-
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -38,7 +37,7 @@
#include "cifs_unicode.h"
#include "fscache.h"
#include "fs_context.h"
-
+#include "cifs_ioctl.h"
static void cifs_set_ops(struct inode *inode)
{
@@ -1610,6 +1609,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1632,6 +1634,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
goto unlink_out;
}
+ cifs_close_all_deferred_files(tcon);
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -1872,6 +1875,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1954,6 +1959,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
}
cifs_sb = CIFS_SB(inode->i_sb);
+ if (unlikely(cifs_forced_shutdown(cifs_sb))) {
+ rc = -EIO;
+ goto rmdir_exit;
+ }
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
rc = PTR_ERR(tlink);
@@ -2088,6 +2098,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
return -EINVAL;
cifs_sb = CIFS_SB(source_dir->i_sb);
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -2109,6 +2122,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
goto cifs_rename_exit;
}
+ cifs_close_all_deferred_files(tcon);
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
@@ -2404,6 +2418,9 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct inode *inode = d_inode(dentry);
int rc;
+ if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+ return -EIO;
+
/*
* We need to be sure that all dirty pages are written and the server
* has actual ctime, mtime and file length.
@@ -2476,6 +2493,9 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
struct cifsFileInfo *cfile;
int rc;
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
/*
* We need to be sure that all dirty pages are written as they
* might fill holes on the server.
@@ -2962,6 +2982,9 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
int rc, retries = 0;
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
do {
if (pTcon->unix_ext)
rc = cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 08d99fec593e..d67d281ab863 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -33,6 +33,7 @@
#include "cifsfs.h"
#include "cifs_ioctl.h"
#include "smb2proto.h"
+#include "smb2glob.h"
#include <linux/btrfs.h>
static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
@@ -164,6 +165,164 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+/*
+ * cifs_shutdown - handle CIFS_IOC_SHUTDOWN by flagging the mount as shut down
+ * @sb:  superblock of the mount being shut down
+ * @arg: user pointer to a __u32 holding one of the CIFS_GOING_FLAGS_* values
+ *
+ * Once CIFS_MOUNT_SHUTDOWN is set in mnt_cifs_flags, cifs_forced_shutdown()
+ * reports true for this superblock.
+ *
+ * Returns 0 on success or if the mount is already shut down, -EPERM when the
+ * caller lacks CAP_SYS_ADMIN, -EFAULT when the flags cannot be read from
+ * user space, and -EINVAL for an unknown or unsupported flag value.
+ */
+static int cifs_shutdown(struct super_block *sb, unsigned long arg)
+{
+	struct cifs_sb_info *sbi = CIFS_SB(sb);
+	__u32 flags;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(flags, (__u32 __user *)arg))
+		return -EFAULT;
+
+	if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH)
+		return -EINVAL;
+
+	/* shutting down twice is a no-op */
+	if (cifs_forced_shutdown(sbi))
+		return 0;
+
+	/* flags is unsigned; terminate the message so it is not left as a partial line */
+	cifs_dbg(VFS, "shut down requested (%u)\n", flags);
+	/* the original left a trace_cifs_shutdown(sb, flags) call disabled here */
+
+	/*
+	 * see:
+	 * https://man7.org/linux/man-pages/man2/ioctl_xfs_goingdown.2.html
+	 * for more information and description of original intent of the flags
+	 */
+	switch (flags) {
+	/*
+	 * We could add support later for default flag which requires:
+	 * "Flush all dirty data and metadata to disk"
+	 * would need to call syncfs or equivalent to flush page cache for
+	 * the mount and then issue fsync to server (if nostrictsync not set)
+	 */
+	case CIFS_GOING_FLAGS_DEFAULT:
+		cifs_dbg(FYI, "shutdown with default flag not supported\n");
+		return -EINVAL;
+	/*
+	 * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not
+	 * data) but metadata writes are not cached on the client, so can treat
+	 * it similarly to NOLOGFLUSH
+	 */
+	case CIFS_GOING_FLAGS_LOGFLUSH:
+	case CIFS_GOING_FLAGS_NOLOGFLUSH:
+		sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * cifs_dump_full_key - copy a session's key material out to a user buffer
+ * @tcon: tree connection the ioctl was issued on
+ * @in: user buffer; first read as input (session_id, in_size), then
+ * overwritten with the filled-in header followed by the keys at in->data
+ *
+ * Keys are written in this order: ses->auth_key.response,
+ * ses->smb3encryptionkey, ses->smb3decryptionkey.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if encryption is not required on @tcon
+ * or the cipher type is not one of the four handled below, -ENOENT if a
+ * non-zero session_id matches no session, -ENOBUFS if in_size is too small,
+ * and -EINVAL if any copy from/to user space fails.
+ */
+static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in)
+{
+ struct smb3_full_key_debug_info out;
+ struct cifs_ses *ses;
+ int rc = 0;
+ /* true only when a reference was taken on ses and must be dropped at out: */
+ bool found = false;
+ u8 __user *end;
+
+ if (!smb3_encryption_required(tcon)) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* copy user input into our output buffer */
+ if (copy_from_user(&out, in, sizeof(out))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (!out.session_id) {
+ /* if ses id is 0, use current user session */
+ ses = tcon->ses;
+ } else {
+ /* otherwise if a session id is given, look for it in all our sessions */
+ struct cifs_ses *ses_it = NULL;
+ struct TCP_Server_Info *server_it = NULL;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) {
+ if (ses_it->Suid == out.session_id) {
+ ses = ses_it;
+ /*
+ * since we are using the session outside the crit
+ * section, we need to make sure it won't be released
+ * so increment its refcount
+ */
+ ses->ses_count++;
+ found = true;
+ goto search_end;
+ }
+ }
+ }
+search_end:
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (!found) {
+ rc = -ENOENT;
+ goto out;
+ }
+ }
+
+ /* pick the key lengths reported to user space from the negotiated cipher */
+ switch (ses->server->cipher_type) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ case SMB2_ENCRYPTION_AES128_GCM:
+ out.session_key_length = CIFS_SESS_KEY_SIZE;
+ out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE;
+ break;
+ case SMB2_ENCRYPTION_AES256_CCM:
+ case SMB2_ENCRYPTION_AES256_GCM:
+ out.session_key_length = CIFS_SESS_KEY_SIZE;
+ out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* check if user buffer is big enough to store all the keys */
+ if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length
+ + out.server_out_key_length) {
+ rc = -ENOBUFS;
+ goto out;
+ }
+
+ out.session_id = ses->Suid;
+ out.cipher_type = le16_to_cpu(ses->server->cipher_type);
+
+ /* overwrite user input with our output */
+ if (copy_to_user(in, &out, sizeof(out))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* append all the keys at the end of the user buffer */
+ end = in->data;
+ if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ end += out.session_key_length;
+
+ if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ end += out.server_in_key_length;
+
+ if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ /* drop the reference taken during the session search, if any */
+ if (found)
+ cifs_put_smb_ses(ses);
+ return rc;
+}
+
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
@@ -277,6 +436,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
break;
case CIFS_DUMP_KEY:
+ /*
+ * Dump encryption keys. This is an old ioctl that only
+ * handles AES-128-{CCM,GCM}.
+ */
if (pSMBFile == NULL)
break;
if (!capable(CAP_SYS_ADMIN)) {
@@ -304,6 +467,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = 0;
break;
+ case CIFS_DUMP_FULL_KEY:
+ /*
+ * Dump encryption keys (handles any key sizes)
+ */
+ if (pSMBFile == NULL)
+ break;
+ if (!capable(CAP_SYS_ADMIN)) {
+ rc = -EACCES;
+ break;
+ }
+ tcon = tlink_tcon(pSMBFile->tlink);
+ rc = cifs_dump_full_key(tcon, (void __user *)arg);
+ break;
case CIFS_IOC_NOTIFY:
if (!S_ISDIR(inode->i_mode)) {
/* Notify can only be done on directories */
@@ -325,6 +501,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
cifs_put_tlink(tlink);
break;
+ case CIFS_IOC_SHUTDOWN:
+ rc = cifs_shutdown(inode->i_sb, arg);
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 616e1bc0cc0a..970fcf2adb08 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -30,6 +30,7 @@
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "smb2proto.h"
+#include "cifs_ioctl.h"
/*
* M-F Symlink Functions - Begin
@@ -518,6 +519,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
struct TCP_Server_Info *server;
struct cifsInodeInfo *cifsInode;
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -679,9 +683,16 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
const char *full_path;
- void *page = alloc_dentry_path();
+ void *page;
struct inode *newinode = NULL;
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
+ page = alloc_dentry_path();
+ if (!page)
+ return -ENOMEM;
+
xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c15a90e422be..7207a63819cb 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -672,6 +672,100 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
+/*
+ * Look up the deferred-close record whose file ids match @cfile on the
+ * inode behind cfile->dentry.
+ *
+ * Must run with deferred_lock held. cifs_deferred_close entries carry no
+ * reference count, so the pointer stored through @pdclose must not be
+ * used once deferred_lock has been dropped.
+ *
+ * Returns true and sets *@pdclose on a match; returns false and leaves
+ * *@pdclose untouched otherwise.
+ */
+bool
+cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifs_deferred_close *entry;
+
+	list_for_each_entry(entry, &cinode->deferred_closes, dlist) {
+		if (entry->netfid != cfile->fid.netfid)
+			continue;
+		if (entry->persistent_fid != cfile->fid.persistent_fid)
+			continue;
+		if (entry->volatile_fid != cfile->fid.volatile_fid)
+			continue;
+
+		*pdclose = entry;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Queue the caller-supplied @dclose on the deferred_closes list of the
+ * inode behind @cfile, stamping it with @cfile's tlink and file ids.
+ * If @cfile already has a record on that list, @dclose is freed instead.
+ *
+ * Must run with deferred_lock held.
+ */
+void
+cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose)
+{
+	struct cifs_deferred_close *existing;
+
+	if (cifs_is_deferred_close(cfile, &existing)) {
+		/* already tracked: the new record is redundant */
+		kfree(dclose);
+		return;
+	}
+
+	dclose->tlink = cfile->tlink;
+	dclose->netfid = cfile->fid.netfid;
+	dclose->persistent_fid = cfile->fid.persistent_fid;
+	dclose->volatile_fid = cfile->fid.volatile_fid;
+	list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes);
+}
+
+/*
+ * Unlink and free the deferred-close record matching @cfile, if there is
+ * one; otherwise do nothing.
+ *
+ * Must run with deferred_lock held.
+ */
+void
+cifs_del_deferred_close(struct cifsFileInfo *cfile)
+{
+	struct cifs_deferred_close *entry;
+
+	if (cifs_is_deferred_close(cfile, &entry)) {
+		list_del(&entry->dlist);
+		kfree(entry);
+	}
+}
+
+/*
+ * For every open file on @cifs_inode that has a deferred-close record,
+ * re-arm its deferred work on deferredclose_wq with a zero delay.
+ *
+ * deferred_lock is taken and released once per file, around the lookup
+ * and the re-arm.
+ *
+ * NOTE(review): openFileList is walked here without any list lock being
+ * taken in this function - confirm callers provide the needed protection.
+ */
+void
+cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
+{
+	spinlock_t *lock = &cifs_inode->deferred_lock;
+	struct cifs_deferred_close *match;
+	struct cifsFileInfo *open_file;
+
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		spin_lock(lock);
+		if (cifs_is_deferred_close(open_file, &match))
+			mod_delayed_work(deferredclose_wq, &open_file->deferred, 0);
+		spin_unlock(lock);
+	}
+}
+
+/*
+ * Re-arm every pending deferred close on @tcon with a zero delay.
+ *
+ * Walks tcon->openFileList with open_file_lock held.
+ */
+void
+cifs_close_all_deferred_files(struct cifs_tcon *tcon)
+{
+	struct cifsFileInfo *open_file;
+
+	spin_lock(&tcon->open_file_lock);
+	list_for_each_entry(open_file, &tcon->openFileList, tlist) {
+		/*
+		 * A false return from mod_delayed_work() means nothing was
+		 * pending any more and new work got queued, so take a
+		 * reference on the file to avoid a use-after-free.
+		 */
+		if (delayed_work_pending(&open_file->deferred) &&
+		    !mod_delayed_work(deferredclose_wq, &open_file->deferred, 0))
+			cifsFileInfo_get(open_file);
+	}
+	spin_unlock(&tcon->open_file_lock);
+}
+
/* parses DFS refferal V3 structure
* caller is responsible for freeing target_nodes
* returns:
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 63d517b9f2ff..a92a1fb7cb52 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -97,6 +97,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
return 0;
}
+ if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname);
+ ses->chan_max = 1;
+ return 0;
+ }
+
/*
* Make a copy of the iface list at the time and use that
* instead so as to not hold the iface spinlock for opening
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index dd0eb665b680..21ef51d338e0 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1861,6 +1861,8 @@ smb2_copychunk_range(const unsigned int xid,
cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
/* Request server copy to target from src identified by key */
+ kfree(retbuf);
+ retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
true /* is_fsctl */, (char *)pcchunk,
@@ -3981,6 +3983,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
oplock &= 0xFF;
+ cinode->lease_granted = false;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
@@ -4007,6 +4010,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int new_oplock = 0;
oplock &= 0xFF;
+ cinode->lease_granted = true;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index e36c2a867783..c205f93e0a10 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -841,6 +841,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
req->SecurityMode = 0;
req->Capabilities = cpu_to_le32(server->vals->req_capabilities);
+ if (ses->chan_max > 1)
+ req->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL);
/* ClientGUID must be zero for SMB2.02 dialect */
if (server->vals->protocol_id == SMB20_PROT_ID)
@@ -956,6 +958,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* Internal types */
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
+ /*
+ * SMB3.0 supports only 1 cipher and doesn't have an encryption neg context.
+ * Set the cipher type manually.
+ */
+ if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+ server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
+
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
(struct smb2_sync_hdr *)rsp);
/*
@@ -1032,6 +1041,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->Capabilities =
cpu_to_le32(server->vals->req_capabilities);
+ if (tcon->ses->chan_max > 1)
+ pneg_inbuf->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL);
+
memcpy(pneg_inbuf->Guid, server->client_guid,
SMB2_CLIENT_GUID_SIZE);
@@ -3895,10 +3907,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFF;
+ shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFF;
+ req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
+ req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
}
}
if (remaining_bytes > io_parms->length)
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d6df908dccad..dafcb6ab050d 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -12,6 +12,11 @@
#include <linux/tracepoint.h>
+/*
+ * Please use this 3-part article as a reference for writing new tracepoints:
+ * https://lwn.net/Articles/379903/
+ */
+
/* For logging errors in read or write */
DECLARE_EVENT_CLASS(smb3_rw_err_class,
TP_PROTO(unsigned int xid,
@@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
TP_ARGS(xid, func_name, rc),
TP_STRUCT__entry(
__field(unsigned int, xid)
- __field(const char *, func_name)
+ __string(func_name, func_name)
__field(int, rc)
),
TP_fast_assign(
__entry->xid = xid;
- __entry->func_name = func_name;
+ __assign_str(func_name, func_name);
__entry->rc = rc;
),
TP_printk("\t%s: xid=%u rc=%d",
- __entry->func_name, __entry->xid, __entry->rc)
+ __get_str(func_name), __entry->xid, __entry->rc)
)
#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \
@@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
TP_ARGS(xid, func_name),
TP_STRUCT__entry(
__field(unsigned int, xid)
- __field(const char *, func_name)
+ __string(func_name, func_name)
),
TP_fast_assign(
__entry->xid = xid;
- __entry->func_name = func_name;
+ __assign_str(func_name, func_name);
),
TP_printk("\t%s: xid=%u",
- __entry->func_name, __entry->xid)
+ __get_str(func_name), __entry->xid)
)
#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \
@@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_STRUCT__entry(
__field(__u64, currmid)
__field(__u64, conn_id)
- __field(char *, hostname)
+ __string(hostname, hostname)
),
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __entry->hostname = hostname;
+ __assign_str(hostname, hostname);
),
TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
__entry->conn_id,
- __entry->hostname,
+ __get_str(hostname),
__entry->currmid)
)
@@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_STRUCT__entry(
__field(__u64, currmid)
__field(__u64, conn_id)
- __field(char *, hostname)
+ __string(hostname, hostname)
__field(int, credits)
__field(int, credits_to_add)
__field(int, in_flight)
@@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __entry->hostname = hostname;
+ __assign_str(hostname, hostname);
__entry->credits = credits;
__entry->credits_to_add = credits_to_add;
__entry->in_flight = in_flight;
@@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_printk("conn_id=0x%llx server=%s current_mid=%llu "
"credits=%d credit_change=%d in_flight=%d",
__entry->conn_id,
- __entry->hostname,
+ __get_str(hostname),
__entry->currmid,
__entry->credits,
__entry->credits_to_add,
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index e351b945135b..aa3e8ca0457c 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,6 +30,7 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
+#include "cifs_ioctl.h"
#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */
@@ -421,6 +422,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
const char *full_path;
void *page;
+ if (unlikely(cifs_forced_shutdown(cifs_sb)))
+ return -EIO;
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
return -EOPNOTSUPP;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 9a3aed249692..c0395363eab9 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -1,7 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset:8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* configfs_internal.h - Internal stuff for configfs
*
* Based on sysfs:
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b6098e02e20b..ac5e0c0e9181 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* dir.c - Operations for configfs directories.
*
* Based on sysfs:
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index da8351d1e455..e26060dae70a 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* file.c - operations for regular (text) files.
*
* Based on sysfs:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 42c348bb2903..eb5ec3e46283 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* inode.c - basic inode and dentry operations.
*
* Based on sysfs:
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 704a4356f137..254170a82aa3 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* item.c - library routines for handling generic config items
*
* Based on kobject:
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 0c6e8cf61953..c2d820063ec4 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* mount.c - operations for initializing and mounting configfs.
*
* Based on sysfs:
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 77c854364e60..0623c3edcfb9 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* symlink.c - operations for configfs symlinks.
*
* Based on sysfs:
diff --git a/fs/d_path.c b/fs/d_path.c
index a69e2cd36e6e..270d62133996 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
/*
* Write full pathname from the root of the filesystem into the buffer.
*/
-static char *__dentry_path(struct dentry *d, char *buf, int buflen)
+static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
{
- struct dentry *dentry;
+ const struct dentry *dentry;
char *end, *retval;
int len, seq = 0;
int error = 0;
@@ -347,7 +347,7 @@ restart:
*retval = '/';
read_seqbegin_or_lock(&rename_lock, &seq);
while (!IS_ROOT(dentry)) {
- struct dentry *parent = dentry->d_parent;
+ const struct dentry *parent = dentry->d_parent;
prefetch(parent);
error = prepend_name(&end, &len, &dentry->d_name);
@@ -371,13 +371,13 @@ Elong:
return ERR_PTR(-ENAMETOOLONG);
}
-char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
return __dentry_path(dentry, buf, buflen);
}
EXPORT_SYMBOL(dentry_path_raw);
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
char *p = NULL;
char *retval;
diff --git a/fs/dax.c b/fs/dax.c
index b3d27fdc6775..62352cbcf0f4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -144,6 +144,16 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
+/**
+ * enum dax_wake_mode - waitqueue wakeup behaviour
+ * @WAKE_ALL: wake all waiters in the waitqueue
+ * @WAKE_NEXT: wake only the first waiter in the waitqueue
+ *
+ * dax_wake_entry() turns the mode into the waiter count it hands to
+ * __wake_up(): 0 for WAKE_ALL, 1 for anything else.
+ */
+enum dax_wake_mode {
+ WAKE_ALL,
+ WAKE_NEXT,
+};
+
static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
void *entry, struct exceptional_entry_key *key)
{
@@ -182,7 +192,8 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry,
+ enum dax_wake_mode mode)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
@@ -196,7 +207,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
* must be in the waitqueue and the following check will see them.
*/
if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+ __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}
/*
@@ -264,11 +275,11 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
finish_wait(wq, &ewait.wait);
}
-static void put_unlocked_entry(struct xa_state *xas, void *entry)
+static void put_unlocked_entry(struct xa_state *xas, void *entry,
+ enum dax_wake_mode mode)
{
- /* If we were the only waiter woken, wake the next one */
if (entry && !dax_is_conflict(entry))
- dax_wake_entry(xas, entry, false);
+ dax_wake_entry(xas, entry, mode);
}
/*
@@ -286,7 +297,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry)
old = xas_store(xas, entry);
xas_unlock_irq(xas);
BUG_ON(!dax_is_locked(old));
- dax_wake_entry(xas, entry, false);
+ dax_wake_entry(xas, entry, WAKE_NEXT);
}
/*
@@ -524,8 +535,8 @@ retry:
dax_disassociate_entry(entry, mapping, false);
xas_store(xas, NULL); /* undo the PMD join */
- dax_wake_entry(xas, entry, true);
- mapping->nrexceptional--;
+ dax_wake_entry(xas, entry, WAKE_ALL);
+ mapping->nrpages -= PG_PMD_NR;
entry = NULL;
xas_set(xas, index);
}
@@ -541,7 +552,7 @@ retry:
dax_lock_entry(xas, entry);
if (xas_error(xas))
goto out_unlock;
- mapping->nrexceptional++;
+ mapping->nrpages += 1UL << order;
}
out_unlock:
@@ -622,7 +633,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
entry = get_unlocked_entry(&xas, 0);
if (entry)
page = dax_busy_page(entry);
- put_unlocked_entry(&xas, entry);
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
if (page)
break;
if (++scanned % XA_CHECK_SCHED)
@@ -661,10 +672,10 @@ static int __dax_invalidate_entry(struct address_space *mapping,
goto out;
dax_disassociate_entry(entry, mapping, trunc);
xas_store(&xas, NULL);
- mapping->nrexceptional--;
+ mapping->nrpages -= 1UL << dax_entry_order(entry);
ret = 1;
out:
- put_unlocked_entry(&xas, entry);
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
xas_unlock_irq(&xas);
return ret;
}
@@ -937,13 +948,13 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_lock_irq(xas);
xas_store(xas, entry);
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
- dax_wake_entry(xas, entry, false);
+ dax_wake_entry(xas, entry, WAKE_NEXT);
trace_dax_writeback_one(mapping->host, index, count);
return ret;
put_unlocked:
- put_unlocked_entry(xas, entry);
+ put_unlocked_entry(xas, entry, WAKE_NEXT);
return ret;
}
@@ -965,7 +976,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
- if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+ if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
return 0;
trace_dax_writeback_range(inode, xas.xa_index, end_index);
@@ -1684,7 +1695,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
/* Did we race with someone splitting entry or so? */
if (!entry || dax_is_conflict(entry) ||
(order == 0 && !dax_is_pte_entry(entry))) {
- put_unlocked_entry(&xas, entry);
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
diff --git a/fs/dcache.c b/fs/dcache.c
index 7d24ff7eb206..cf871a81f4fd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
+const struct qstr dotdot_name = QSTR_INIT("..", 2);
+EXPORT_SYMBOL(dotdot_name);
/*
* This is the single most critical data structure when it comes
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 1d252164d97b..8129a430d789 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -45,10 +45,13 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;
static int debugfs_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *ia)
{
- int ret = security_locked_down(LOCKDOWN_DEBUGFS);
+ int ret;
- if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
- return ret;
+ if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) {
+ ret = security_locked_down(LOCKDOWN_DEBUGFS);
+ if (ret)
+ return ret;
+ }
return simple_setattr(&init_user_ns, dentry, ia);
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 49c5f9407098..88d95d96e36c 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item,
CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, bool (*check_cb)(unsigned int x),
+ int *info_field, int (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -137,8 +137,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
if (rc)
return rc;
- if (check_cb && check_cb(x))
- return -EINVAL;
+ if (check_cb) {
+ rc = check_cb(x);
+ if (rc)
+ return rc;
+ }
*cl_field = x;
*info_field = x;
@@ -161,17 +164,53 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
} \
CONFIGFS_ATTR(cluster_, name);
-static bool dlm_check_zero(unsigned int x)
+/*
+ * Validate a new "protocol" value: only 0 (TCP) and 1 (SCTP) are accepted,
+ * and the change is refused with -EBUSY while dlm_allow_conn is set.
+ * Returns 0 when the value may be stored, -EINVAL for any other selector.
+ */
+static int dlm_check_protocol_and_dlm_running(unsigned int x)
+{
+	/* 0 = TCP, 1 = SCTP; everything else is rejected */
+	if (x != 0 && x != 1)
+		return -EINVAL;
+
+	return dlm_allow_conn ? -EBUSY : 0;
+}
+
+/*
+ * Validate a value that must be non-zero and may only change while
+ * dlm_allow_conn is clear. Returns -EINVAL for zero, -EBUSY while
+ * dlm_allow_conn is set, 0 otherwise.
+ */
+static int dlm_check_zero_and_dlm_running(unsigned int x)
+{
+	if (x == 0)
+		return -EINVAL;
+
+	return dlm_allow_conn ? -EBUSY : 0;
+}
+
+static int dlm_check_zero(unsigned int x)
{
- return !x;
+ if (!x)
+ return -EINVAL;
+
+ return 0;
}
-static bool dlm_check_buffer_size(unsigned int x)
+static int dlm_check_buffer_size(unsigned int x)
{
- return (x < DEFAULT_BUFFER_SIZE);
+ if (x < DEFAULT_BUFFER_SIZE)
+ return -EINVAL;
+
+ return 0;
}
-CLUSTER_ATTR(tcp_port, dlm_check_zero);
+CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running);
CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
CLUSTER_ATTR(recover_timer, dlm_check_zero);
@@ -179,7 +218,7 @@ CLUSTER_ATTR(toss_secs, dlm_check_zero);
CLUSTER_ATTR(scan_secs, dlm_check_zero);
CLUSTER_ATTR(log_debug, NULL);
CLUSTER_ATTR(log_info, NULL);
-CLUSTER_ATTR(protocol, NULL);
+CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
CLUSTER_ATTR(mark, NULL);
CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
CLUSTER_ATTR(waitwarn_us, NULL);
@@ -688,6 +727,7 @@ static ssize_t comm_mark_show(struct config_item *item, char *buf)
static ssize_t comm_mark_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_comm *comm;
unsigned int mark;
int rc;
@@ -695,7 +735,15 @@ static ssize_t comm_mark_store(struct config_item *item, const char *buf,
if (rc)
return rc;
- config_item_to_comm(item)->mark = mark;
+ if (mark == 0)
+ mark = dlm_config.ci_mark;
+
+ comm = config_item_to_comm(item);
+ rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark);
+ if (rc)
+ return rc;
+
+ comm->mark = mark;
return len;
}
@@ -870,24 +918,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
return 0;
}
-void dlm_comm_mark(int nodeid, unsigned int *mark)
-{
- struct dlm_comm *cm;
-
- cm = get_comm(nodeid);
- if (!cm) {
- *mark = dlm_config.ci_mark;
- return;
- }
-
- if (cm->mark)
- *mark = cm->mark;
- else
- *mark = dlm_config.ci_mark;
-
- put_comm(cm);
-}
-
int dlm_our_nodeid(void)
{
return local_comm ? local_comm->nodeid : 0;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index c210250a2581..d2cd4bd20313 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -48,7 +48,6 @@ void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
int dlm_comm_seq(int nodeid, uint32_t *seq);
-void dlm_comm_mark(int nodeid, unsigned int *mark);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index d6bbccb0ed15..d5bd990bcab8 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -542,6 +542,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
if (bucket >= ls->ls_rsbtbl_size) {
kfree(ri);
+ ++*pos;
return NULL;
}
tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 002123efc6b0..b93df39d0915 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3541,8 +3541,6 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
if (!mh)
return -ENOBUFS;
- memset(mb, 0, mb_len);
-
ms = (struct dlm_message *) mb;
ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 561dcad08ad6..c14cf2b7faab 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -404,12 +404,6 @@ static int threads_start(void)
return error;
}
-static void threads_stop(void)
-{
- dlm_scand_stop();
- dlm_lowcomms_stop();
-}
-
static int new_lockspace(const char *name, const char *cluster,
uint32_t flags, int lvblen,
const struct dlm_lockspace_ops *ops, void *ops_arg,
@@ -702,8 +696,11 @@ int dlm_new_lockspace(const char *name, const char *cluster,
ls_count++;
if (error > 0)
error = 0;
- if (!ls_count)
- threads_stop();
+ if (!ls_count) {
+ dlm_scand_stop();
+ dlm_lowcomms_shutdown();
+ dlm_lowcomms_stop();
+ }
out:
mutex_unlock(&ls_lock);
return error;
@@ -788,6 +785,11 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_recoverd_stop(ls);
+ if (ls_count == 1) {
+ dlm_scand_stop();
+ dlm_lowcomms_shutdown();
+ }
+
dlm_callback_stop(ls);
remove_lockspace(ls);
@@ -880,7 +882,7 @@ int dlm_release_lockspace(void *lockspace, int force)
if (!error)
ls_count--;
if (!ls_count)
- threads_stop();
+ dlm_lowcomms_stop();
mutex_unlock(&ls_lock);
return error;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 372c34ff8594..166e36fcf3e4 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -102,6 +102,9 @@ struct listen_connection {
struct work_struct rwork;
};
+/*
+ * Byte accounting helpers for an entry @e with ->offset and ->end fields
+ * (presumably struct writequeue_entry - confirm against its definition):
+ * REMAIN is PAGE_SIZE minus e->end, LENGTH is e->end minus e->offset.
+ * The argument is parenthesized so expressions such as "p + 1" expand
+ * correctly.
+ */
+#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - (e)->end)
+#define DLM_WQ_LENGTH_BYTES(e) ((e)->end - (e)->offset)
+
/* An entry waiting to be sent */
struct writequeue_entry {
struct list_head list;
@@ -116,6 +119,7 @@ struct writequeue_entry {
struct dlm_node_addr {
struct list_head list;
int nodeid;
+ int mark;
int addr_count;
int curr_addr_index;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
@@ -134,7 +138,7 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin);
static struct listen_connection listen_con;
static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
-static int dlm_allow_conn;
+int dlm_allow_conn;
/* Work queues */
static struct workqueue_struct *recv_workqueue;
@@ -303,7 +307,8 @@ static int addr_compare(const struct sockaddr_storage *x,
}
static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
- struct sockaddr *sa_out, bool try_new_addr)
+ struct sockaddr *sa_out, bool try_new_addr,
+ unsigned int *mark)
{
struct sockaddr_storage sas;
struct dlm_node_addr *na;
@@ -331,6 +336,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
if (!na->addr_count)
return -ENOENT;
+ *mark = na->mark;
+
if (sas_out)
memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
@@ -350,7 +357,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
return 0;
}
-static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
+ unsigned int *mark)
{
struct dlm_node_addr *na;
int rv = -EEXIST;
@@ -364,6 +372,7 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
if (addr_compare(na->addr[addr_i], addr)) {
*nodeid = na->nodeid;
+ *mark = na->mark;
rv = 0;
goto unlock;
}
@@ -412,6 +421,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
new_node->nodeid = nodeid;
new_node->addr[0] = new_addr;
new_node->addr_count = 1;
+ new_node->mark = dlm_config.ci_mark;
list_add(&new_node->list, &dlm_node_addrs);
spin_unlock(&dlm_node_addrs_spin);
return 0;
@@ -519,6 +529,23 @@ int dlm_lowcomms_connect_node(int nodeid)
return 0;
}
+int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
+{
+ struct dlm_node_addr *na;
+
+ spin_lock(&dlm_node_addrs_spin);
+ na = find_node_addr(nodeid);
+ if (!na) {
+ spin_unlock(&dlm_node_addrs_spin);
+ return -ENOENT;
+ }
+
+ na->mark = mark;
+ spin_unlock(&dlm_node_addrs_spin);
+
+ return 0;
+}
+
static void lowcomms_error_report(struct sock *sk)
{
struct connection *con;
@@ -685,10 +712,7 @@ static void shutdown_connection(struct connection *con)
{
int ret;
- if (cancel_work_sync(&con->swork)) {
- log_print("canceled swork for node %d", con->nodeid);
- clear_bit(CF_WRITE_PENDING, &con->flags);
- }
+ flush_work(&con->swork);
mutex_lock(&con->sock_mutex);
/* nothing to shutdown */
@@ -867,7 +891,7 @@ static int accept_from_sock(struct listen_connection *con)
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len);
- if (addr_to_nodeid(&peeraddr, &nodeid)) {
+ if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) {
unsigned char *b=(unsigned char *)&peeraddr;
log_print("connect from non cluster node");
print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
@@ -876,9 +900,6 @@ static int accept_from_sock(struct listen_connection *con)
return -1;
}
- dlm_comm_mark(nodeid, &mark);
- sock_set_mark(newsock->sk, mark);
-
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
@@ -892,6 +913,8 @@ static int accept_from_sock(struct listen_connection *con)
goto accept_err;
}
+ sock_set_mark(newsock->sk, mark);
+
mutex_lock(&newcon->sock_mutex);
if (newcon->sock) {
struct connection *othercon = newcon->othercon;
@@ -908,16 +931,18 @@ static int accept_from_sock(struct listen_connection *con)
result = dlm_con_init(othercon, nodeid);
if (result < 0) {
kfree(othercon);
+ mutex_unlock(&newcon->sock_mutex);
goto accept_err;
}
+ lockdep_set_subclass(&othercon->sock_mutex, 1);
newcon->othercon = othercon;
} else {
/* close other sock con if we have something new */
close_connection(othercon, false, true, false);
}
- mutex_lock_nested(&othercon->sock_mutex, 1);
+ mutex_lock(&othercon->sock_mutex);
add_sock(newsock, othercon);
addcon = othercon;
mutex_unlock(&othercon->sock_mutex);
@@ -930,6 +955,7 @@ static int accept_from_sock(struct listen_connection *con)
addcon = newcon;
}
+ set_bit(CF_CONNECTED, &addcon->flags);
mutex_unlock(&newcon->sock_mutex);
/*
@@ -1015,8 +1041,6 @@ static void sctp_connect_to_sock(struct connection *con)
struct socket *sock;
unsigned int mark;
- dlm_comm_mark(con->nodeid, &mark);
-
mutex_lock(&con->sock_mutex);
/* Some odd races can cause double-connects, ignore them */
@@ -1029,7 +1053,7 @@ static void sctp_connect_to_sock(struct connection *con)
}
memset(&daddr, 0, sizeof(daddr));
- result = nodeid_to_addr(con->nodeid, &daddr, NULL, true);
+ result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out;
@@ -1104,13 +1128,11 @@ out:
static void tcp_connect_to_sock(struct connection *con)
{
struct sockaddr_storage saddr, src_addr;
+ unsigned int mark;
int addr_len;
struct socket *sock = NULL;
- unsigned int mark;
int result;
- dlm_comm_mark(con->nodeid, &mark);
-
mutex_lock(&con->sock_mutex);
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
@@ -1125,15 +1147,15 @@ static void tcp_connect_to_sock(struct connection *con)
if (result < 0)
goto out_err;
- sock_set_mark(sock->sk, mark);
-
memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out_err;
}
+ sock_set_mark(sock->sk, mark);
+
add_sock(sock, con);
/* Bind to our cluster-known address connecting to avoid
@@ -1330,70 +1352,72 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
{
struct writequeue_entry *entry;
- entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+ entry = kzalloc(sizeof(*entry), allocation);
if (!entry)
return NULL;
- entry->page = alloc_page(allocation);
+ entry->page = alloc_page(allocation | __GFP_ZERO);
if (!entry->page) {
kfree(entry);
return NULL;
}
- entry->offset = 0;
- entry->len = 0;
- entry->end = 0;
- entry->users = 0;
entry->con = con;
+ entry->users = 1;
return entry;
}
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
+ gfp_t allocation, char **ppc)
{
- struct connection *con;
struct writequeue_entry *e;
- int offset = 0;
- if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) {
- BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN);
- log_print("failed to allocate a buffer of size %d", len);
- return NULL;
+ spin_lock(&con->writequeue_lock);
+ if (!list_empty(&con->writequeue)) {
+ e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
+ if (DLM_WQ_REMAIN_BYTES(e) >= len) {
+ *ppc = page_address(e->page) + e->end;
+ e->end += len;
+ e->users++;
+ spin_unlock(&con->writequeue_lock);
+
+ return e;
+ }
}
+ spin_unlock(&con->writequeue_lock);
- con = nodeid2con(nodeid, allocation);
- if (!con)
+ e = new_writequeue_entry(con, allocation);
+ if (!e)
return NULL;
+ *ppc = page_address(e->page);
+ e->end += len;
+
spin_lock(&con->writequeue_lock);
- e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
- if ((&e->list == &con->writequeue) ||
- (PAGE_SIZE - e->end < len)) {
- e = NULL;
- } else {
- offset = e->end;
- e->end += len;
- e->users++;
- }
+ list_add_tail(&e->list, &con->writequeue);
spin_unlock(&con->writequeue_lock);
- if (e) {
- got_one:
- *ppc = page_address(e->page) + offset;
- return e;
- }
+ return e;
+};
- e = new_writequeue_entry(con, allocation);
- if (e) {
- spin_lock(&con->writequeue_lock);
- offset = e->end;
- e->end += len;
- e->users++;
- list_add_tail(&e->list, &con->writequeue);
- spin_unlock(&con->writequeue_lock);
- goto got_one;
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+{
+ struct connection *con;
+
+ if (len > DEFAULT_BUFFER_SIZE ||
+ len < sizeof(struct dlm_header)) {
+ BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE);
+ log_print("failed to allocate a buffer of size %d", len);
+ WARN_ON(1);
+ return NULL;
}
- return NULL;
+
+ con = nodeid2con(nodeid, allocation);
+ if (!con)
+ return NULL;
+
+ return new_wq_entry(con, len, allocation, ppc);
}
void dlm_lowcomms_commit_buffer(void *mh)
@@ -1406,7 +1430,8 @@ void dlm_lowcomms_commit_buffer(void *mh)
users = --e->users;
if (users)
goto out;
- e->len = e->end - e->offset;
+
+ e->len = DLM_WQ_LENGTH_BYTES(e);
spin_unlock(&con->writequeue_lock);
queue_work(send_workqueue, &con->swork);
@@ -1432,11 +1457,10 @@ static void send_to_sock(struct connection *con)
spin_lock(&con->writequeue_lock);
for (;;) {
- e = list_entry(con->writequeue.next, struct writequeue_entry,
- list);
- if ((struct list_head *) e == &con->writequeue)
+ if (list_empty(&con->writequeue))
break;
+ e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
BUG_ON(len == 0 && e->users == 0);
@@ -1589,6 +1613,29 @@ static int work_start(void)
return 0;
}
+static void shutdown_conn(struct connection *con)
+{
+ if (con->shutdown_action)
+ con->shutdown_action(con);
+}
+
+void dlm_lowcomms_shutdown(void)
+{
+ /* Set all the flags to prevent any
+ * socket activity.
+ */
+ dlm_allow_conn = 0;
+
+ if (recv_workqueue)
+ flush_workqueue(recv_workqueue);
+ if (send_workqueue)
+ flush_workqueue(send_workqueue);
+
+ dlm_close_sock(&listen_con.sock);
+
+ foreach_conn(shutdown_conn);
+}
+
static void _stop_conn(struct connection *con, bool and_other)
{
mutex_lock(&con->sock_mutex);
@@ -1610,12 +1657,6 @@ static void stop_conn(struct connection *con)
_stop_conn(con, true);
}
-static void shutdown_conn(struct connection *con)
-{
- if (con->shutdown_action)
- con->shutdown_action(con);
-}
-
static void connection_release(struct rcu_head *rcu)
{
struct connection *con = container_of(rcu, struct connection, rcu);
@@ -1672,19 +1713,6 @@ static void work_flush(void)
void dlm_lowcomms_stop(void)
{
- /* Set all the flags to prevent any
- socket activity.
- */
- dlm_allow_conn = 0;
-
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
-
- dlm_close_sock(&listen_con.sock);
-
- foreach_conn(shutdown_conn);
work_flush();
foreach_conn(free_conn);
work_stop();
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 0918f9376489..48bbc4e18761 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -14,13 +14,18 @@
#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096
+/* switch to check if dlm is running */
+extern int dlm_allow_conn;
+
int dlm_lowcomms_start(void);
+void dlm_lowcomms_shutdown(void);
void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
void dlm_lowcomms_commit_buffer(void *mh);
int dlm_lowcomms_connect_node(int nodeid);
+int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
#endif /* __LOWCOMMS_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index fde3a6afe4be..1c6654a21ec4 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -22,8 +22,6 @@
* into packets and sends them to the comms layer.
*/
-#include <asm/unaligned.h>
-
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
@@ -45,13 +43,22 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
while (len >= sizeof(struct dlm_header)) {
hd = (struct dlm_header *)ptr;
- /* no message should be more than this otherwise we
- * cannot deliver this message to upper layers
+ /* no message should be more than DEFAULT_BUFFER_SIZE or
+ * less than dlm_header size.
+ *
+ * Some messages does not have a 8 byte length boundary yet
+ * which can occur in a unaligned memory access of some dlm
+ * messages. However this problem need to be fixed at the
+ * sending side, for now it seems nobody run into architecture
+ * related issues yet but it slows down some processing.
+ * Fixing this issue should be scheduled in future by doing
+ * the next major version bump.
*/
- msglen = get_unaligned_le16(&hd->h_length);
- if (msglen > DEFAULT_BUFFER_SIZE) {
- log_print("received invalid length header: %u, will abort message parsing",
- msglen);
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > DEFAULT_BUFFER_SIZE ||
+ msglen < sizeof(struct dlm_header)) {
+ log_print("received invalid length header: %u from node %d, will abort message parsing",
+ msglen, nodeid);
return -EBADMSG;
}
@@ -84,15 +91,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
goto skip;
}
- /* for aligned memory access, we just copy current message
- * to begin of the buffer which contains already parsed buffer
- * data and should provide align access for upper layers
- * because the start address of the buffer has a aligned
- * address. This memmove can be removed when the upperlayer
- * is capable of unaligned memory access.
- */
- memmove(buf, ptr, msglen);
- dlm_receive_buffer((union dlm_packet *)buf, nodeid);
+ dlm_receive_buffer((union dlm_packet *)ptr, nodeid);
skip:
ret += msglen;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 73ddee5159d7..f5b1bd65728d 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -41,7 +41,6 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
to_nodeid, type, len);
return -ENOBUFS;
}
- memset(mb, 0, mb_len);
rc = (struct dlm_rcom *) mb;
@@ -462,7 +461,6 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
- memset(mb, 0, mb_len);
rc = (struct dlm_rcom *) mb;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 943e523f4c9d..e3f5d7f3c8a0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2004 Erez Zadok
@@ -296,10 +296,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct extent_crypt_result ecr;
int rc = 0;
- if (!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
- return -EINVAL;
-
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
@@ -350,7 +346,7 @@ out:
return rc;
}
-/**
+/*
* lower_offset_for_page
*
* Convert an eCryptfs page index into a lower byte offset
@@ -535,7 +531,7 @@ int ecryptfs_decrypt_page(struct page *page)
rc = crypt_extent(crypt_stat, page, page,
extent_offset, DECRYPT);
if (rc) {
- printk(KERN_ERR "%s: Error encrypting extent; "
+ printk(KERN_ERR "%s: Error decrypting extent; "
"rc = [%d]\n", __func__, rc);
goto out;
}
@@ -627,9 +623,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
}
}
-/**
+/*
* ecryptfs_compute_root_iv
- * @crypt_stats
*
* On error, sets the root IV to all 0's.
*/
@@ -1370,7 +1365,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
return rc;
}
-/**
+/*
* ecryptfs_read_metadata
*
* Common entry point for reading file metadata. From here, we could
@@ -1448,7 +1443,7 @@ out:
return rc;
}
-/**
+/*
* ecryptfs_encrypt_filename - encrypt filename
*
* CBC-encrypts the filename. We do not want to encrypt the same
@@ -1590,11 +1585,10 @@ out:
struct kmem_cache *ecryptfs_key_tfm_cache;
static struct list_head key_tfm_list;
-struct mutex key_tfm_list_mutex;
+DEFINE_MUTEX(key_tfm_list_mutex);
int __init ecryptfs_init_crypto(void)
{
- mutex_init(&key_tfm_list_mutex);
INIT_LIST_HEAD(&key_tfm_list);
return 0;
}
@@ -1877,10 +1871,11 @@ out:
/**
* ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @encoded_name: The encrypted name
+ * @encoded_name_size: Length of the encrypted name
+ * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode
* @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
+ * @name_size: The length of the plaintext name
*
* Encrypts and encodes a filename into something that constitutes a
* valid filename for a filesystem, with printable characters.
@@ -1992,7 +1987,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size)
* ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
* @plaintext_name: The plaintext name
* @plaintext_name_size: The plaintext name size
- * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @sb: Ecryptfs's super_block
* @name: The filename in cipher text
* @name_size: The cipher text name size
*
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
index 1f65e99f9a41..cf6d0e8e25a1 100644
--- a/fs/ecryptfs/debug.c
+++ b/fs/ecryptfs/debug.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
* Functions only useful for debugging.
*
@@ -9,7 +9,7 @@
#include "ecryptfs_kernel.h"
-/**
+/*
* ecryptfs_dump_auth_tok - debug function to print auth toks
*
* This function will print the contents of an ecryptfs authentication
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 44606f079efb..acaa0825e9bb 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2003 Erez Zadok
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e6ac78c62ca4..5f2b49e13731 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -262,10 +262,7 @@ struct ecryptfs_inode_info {
* vfsmount too. */
struct ecryptfs_dentry_info {
struct path lower_path;
- union {
- struct ecryptfs_crypt_stat *crypt_stat;
- struct rcu_head rcu;
- };
+ struct rcu_head rcu;
};
/**
@@ -496,12 +493,6 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
}
-static inline struct ecryptfs_dentry_info *
-ecryptfs_dentry_to_private(struct dentry *dentry)
-{
- return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
-}
-
static inline void
ecryptfs_set_dentry_private(struct dentry *dentry,
struct ecryptfs_dentry_info *dentry_info)
@@ -515,12 +506,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
}
-static inline struct vfsmount *
-ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
-{
- return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;
-}
-
static inline struct path *
ecryptfs_dentry_to_lower_path(struct dentry *dentry)
{
@@ -528,7 +513,7 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry)
}
#define ecryptfs_printk(type, fmt, arg...) \
- __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg)
__printf(1, 2)
void __ecryptfs_printk(const char *fmt, ...);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 5fb45d865ce5..18d5b91cb573 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2004 Erez Zadok
@@ -19,7 +19,7 @@
#include <linux/fs_stack.h>
#include "ecryptfs_kernel.h"
-/**
+/*
* ecryptfs_read_update_atime
*
* generic_file_read updates the atime of upper layer inode. But, it
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 7169ea873347..16d50dface59 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2004 Erez Zadok
@@ -22,19 +22,18 @@
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
-static struct dentry *lock_parent(struct dentry *dentry)
+static int lock_parent(struct dentry *dentry,
+ struct dentry **lower_dentry,
+ struct inode **lower_dir)
{
- struct dentry *dir;
+ struct dentry *lower_dir_dentry;
- dir = dget_parent(dentry);
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- return dir;
-}
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ *lower_dir = d_inode(lower_dir_dentry);
+ *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-static void unlock_dir(struct dentry *dir)
-{
- inode_unlock(d_inode(dir));
- dput(dir);
+ inode_lock_nested(*lower_dir, I_MUTEX_PARENT);
+ return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL;
}
static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
@@ -128,32 +127,29 @@ static int ecryptfs_interpose(struct dentry *lower_dentry,
static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct dentry *lower_dir_dentry;
- struct inode *lower_dir_inode;
+ struct dentry *lower_dentry;
+ struct inode *lower_dir;
int rc;
- lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
- lower_dir_inode = d_inode(lower_dir_dentry);
- inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ rc = lock_parent(dentry, &lower_dentry, &lower_dir);
dget(lower_dentry); // don't even try to make the lower negative
- if (lower_dentry->d_parent != lower_dir_dentry)
- rc = -EINVAL;
- else if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
- NULL);
+ if (!rc) {
+ if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry,
+ NULL);
+ }
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
}
- fsstack_copy_attr_times(dir, lower_dir_inode);
+ fsstack_copy_attr_times(dir, lower_dir);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode->i_ctime = dir->i_ctime;
out_unlock:
dput(lower_dentry);
- inode_unlock(lower_dir_inode);
+ inode_unlock(lower_dir);
if (!rc)
d_drop(dentry);
return rc;
@@ -177,13 +173,13 @@ ecryptfs_do_create(struct inode *directory_inode,
{
int rc;
struct dentry *lower_dentry;
- struct dentry *lower_dir_dentry;
+ struct inode *lower_dir;
struct inode *inode;
- lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
- mode, true);
+ rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
+ if (!rc)
+ rc = vfs_create(&init_user_ns, lower_dir,
+ lower_dentry, mode, true);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
@@ -193,18 +189,17 @@ ecryptfs_do_create(struct inode *directory_inode,
inode = __ecryptfs_get_inode(d_inode(lower_dentry),
directory_inode->i_sb);
if (IS_ERR(inode)) {
- vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
- lower_dentry, NULL);
+ vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL);
goto out_lock;
}
- fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
- fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry));
+ fsstack_copy_attr_times(directory_inode, lower_dir);
+ fsstack_copy_inode_size(directory_inode, lower_dir);
out_lock:
- unlock_dir(lower_dir_dentry);
+ inode_unlock(lower_dir);
return inode;
}
-/**
+/*
* ecryptfs_initialize_file
*
* Cause the file to be changed from a basic empty file to an ecryptfs
@@ -247,10 +242,8 @@ out:
return rc;
}
-/**
+/*
* ecryptfs_create
- * @dir: The inode of the directory in which to create the file.
- * @dentry: The eCryptfs dentry
* @mode: The mode of the new file.
*
* Creates a new file.
@@ -318,7 +311,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
return 0;
}
-/**
+/*
* ecryptfs_lookup_interpose - Dentry interposition for a lookup
*/
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
@@ -431,32 +424,28 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
{
struct dentry *lower_old_dentry;
struct dentry *lower_new_dentry;
- struct dentry *lower_dir_dentry;
+ struct inode *lower_dir;
u64 file_size_save;
int rc;
file_size_save = i_size_read(d_inode(old_dentry));
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
- lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
- dget(lower_old_dentry);
- dget(lower_new_dentry);
- lower_dir_dentry = lock_parent(lower_new_dentry);
- rc = vfs_link(lower_old_dentry, &init_user_ns,
- d_inode(lower_dir_dentry), lower_new_dentry, NULL);
+ rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
+ if (!rc)
+ rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir,
+ lower_new_dentry, NULL);
if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
if (rc)
goto out_lock;
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_attr_times(dir, lower_dir);
+ fsstack_copy_inode_size(dir, lower_dir);
set_nlink(d_inode(old_dentry),
ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
i_size_write(d_inode(new_dentry), file_size_save);
out_lock:
- unlock_dir(lower_dir_dentry);
- dput(lower_new_dentry);
- dput(lower_old_dentry);
+ inode_unlock(lower_dir);
return rc;
}
@@ -471,14 +460,14 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
{
int rc;
struct dentry *lower_dentry;
- struct dentry *lower_dir_dentry;
+ struct inode *lower_dir;
char *encoded_symname;
size_t encoded_symlen;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- dget(lower_dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
+ rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+ if (rc)
+ goto out_lock;
mount_crypt_stat = &ecryptfs_superblock_to_private(
dir->i_sb)->mount_crypt_stat;
rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -487,7 +476,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
strlen(symname));
if (rc)
goto out_lock;
- rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+ rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry,
encoded_symname);
kfree(encoded_symname);
if (rc || d_really_is_negative(lower_dentry))
@@ -495,11 +484,10 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out_lock;
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_attr_times(dir, lower_dir);
+ fsstack_copy_inode_size(dir, lower_dir);
out_lock:
- unlock_dir(lower_dir_dentry);
- dput(lower_dentry);
+ inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -510,22 +498,22 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
{
int rc;
struct dentry *lower_dentry;
- struct dentry *lower_dir_dentry;
+ struct inode *lower_dir;
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
- mode);
+ rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+ if (!rc)
+ rc = vfs_mkdir(&init_user_ns, lower_dir,
+ lower_dentry, mode);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
- set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+ fsstack_copy_attr_times(dir, lower_dir);
+ fsstack_copy_inode_size(dir, lower_dir);
+ set_nlink(dir, lower_dir->i_nlink);
out:
- unlock_dir(lower_dir_dentry);
+ inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -534,29 +522,24 @@ out:
static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct dentry *lower_dentry;
- struct dentry *lower_dir_dentry;
- struct inode *lower_dir_inode;
+ struct inode *lower_dir;
int rc;
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
- lower_dir_inode = d_inode(lower_dir_dentry);
-
- inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ rc = lock_parent(dentry, &lower_dentry, &lower_dir);
dget(lower_dentry); // don't even try to make the lower negative
- if (lower_dentry->d_parent != lower_dir_dentry)
- rc = -EINVAL;
- else if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
+ if (!rc) {
+ if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry);
+ }
if (!rc) {
clear_nlink(d_inode(dentry));
- fsstack_copy_attr_times(dir, lower_dir_inode);
- set_nlink(dir, lower_dir_inode->i_nlink);
+ fsstack_copy_attr_times(dir, lower_dir);
+ set_nlink(dir, lower_dir->i_nlink);
}
dput(lower_dentry);
- inode_unlock(lower_dir_inode);
+ inode_unlock(lower_dir);
if (!rc)
d_drop(dentry);
return rc;
@@ -568,21 +551,21 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
{
int rc;
struct dentry *lower_dentry;
- struct dentry *lower_dir_dentry;
+ struct inode *lower_dir;
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
- mode, dev);
+ rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+ if (!rc)
+ rc = vfs_mknod(&init_user_ns, lower_dir,
+ lower_dentry, mode, dev);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_attr_times(dir, lower_dir);
+ fsstack_copy_inode_size(dir, lower_dir);
out:
- unlock_dir(lower_dir_dentry);
+ inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -888,6 +871,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
/**
* ecryptfs_setattr
+ * @mnt_userns: user namespace of the target mount
* @dentry: dentry handle to the inode to modify
* @ia: Structure with flags of what to change and values
*
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index f6a17d259db7..3fe41964c0d8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
* In-kernel key management code. Includes functions to parse and
* write authentication token-related packets with the underlying
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include "ecryptfs_kernel.h"
-/**
+/*
* request_key returned an error instead of a valid key address;
* determine the type of error, make appropriate log entries, and
* return an error code.
@@ -536,8 +536,9 @@ out:
/**
* ecryptfs_find_auth_tok_for_sig
+ * @auth_tok_key: key containing the authentication token
* @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
+ * @mount_crypt_stat: inode crypt_stat crypto context
* @sig: Sig of auth_tok to find
*
* For now, this function simply looks at the registered auth_tok's
@@ -576,7 +577,7 @@ ecryptfs_find_auth_tok_for_sig(
return rc;
}
-/**
+/*
* write_tag_70_packet can gobble a lot of stack space. We stuff most
* of the function's parameters in a kmalloc'd struct to help reduce
* eCryptfs' overall stack usage.
@@ -604,7 +605,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct shash_desc *hash_desc;
};
-/**
+/*
* write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
* @filename: NULL-terminated filename string
*
@@ -873,7 +874,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
};
/**
- * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * ecryptfs_parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
* @filename: This function kmalloc's the memory for the filename
* @filename_size: This function sets this to the amount of memory
* kmalloc'd for the filename
@@ -1172,7 +1173,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code);
if (rc) {
ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n",
- cipher_code)
+ cipher_code);
goto out;
}
crypt_stat->flags |= ECRYPTFS_KEY_VALID;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index a7c903cb01a0..ae4cb4e2e134 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 2008 International Business Machines Corp.
@@ -108,6 +108,7 @@ void ecryptfs_destroy_kthread(void)
* @lower_file: Result of dentry_open by root on lower dentry
* @lower_dentry: Lower dentry for file to open
* @lower_mnt: Lower vfsmount for file to open
+ * @cred: credential to use for this call
*
* This function gets a r/w file opened against the lower dentry.
*
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cdf40a54a35d..d66bbd2df191 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2003 Erez Zadok
@@ -24,7 +24,7 @@
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
-/**
+/*
* Module parameter that defines the ecryptfs_verbosity level.
*/
int ecryptfs_verbosity = 0;
@@ -34,7 +34,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
"Initial verbosity level (0 or 1; defaults to "
"0, which is Quiet)");
-/**
+/*
* Module parameter that defines the number of message buffer elements
*/
unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
@@ -43,7 +43,7 @@ module_param(ecryptfs_message_buf_len, uint, 0);
MODULE_PARM_DESC(ecryptfs_message_buf_len,
"Number of message buffer elements");
-/**
+/*
* Module parameter that defines the maximum guaranteed amount of time to wait
* for a response from ecryptfsd. The actual sleep time will be, more than
* likely, a small amount greater than this specified value, but only less if
@@ -57,7 +57,7 @@ MODULE_PARM_DESC(ecryptfs_message_wait_timeout,
"sleep while waiting for a message response from "
"userspace");
-/**
+/*
* Module parameter that is an estimate of the maximum number of users
* that will be concurrently using eCryptfs. Set this to the right
* value to balance performance and memory use.
@@ -80,7 +80,7 @@ void __ecryptfs_printk(const char *fmt, ...)
va_end(args);
}
-/**
+/*
* ecryptfs_init_lower_file
* @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with
* the lower dentry and the lower mount set
@@ -221,7 +221,7 @@ static void ecryptfs_init_mount_crypt_stat(
/**
* ecryptfs_parse_options
- * @sb: The ecryptfs super block
+ * @sbi: The ecryptfs super block
* @options: The options passed to the kernel
* @check_ruid: set to 1 if device uid should be checked against the ruid
*
@@ -466,10 +466,10 @@ out:
struct kmem_cache *ecryptfs_sb_info_cache;
static struct file_system_type ecryptfs_fs_type;
-/**
- * ecryptfs_get_sb
- * @fs_type
- * @flags
+/*
+ * ecryptfs_mount
+ * @fs_type: The filesystem type that the superblock should belong to
+ * @flags: The flags associated with the mount
* @dev_name: The path to mount over
* @raw_data: The options passed into the kernel
*/
@@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
+ if (!dev_name) {
+ rc = -EINVAL;
+ err = "Device name cannot be null";
+ goto out;
+ }
+
rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
if (rc) {
err = "Error parsing options";
@@ -635,7 +641,7 @@ static struct file_system_type ecryptfs_fs_type = {
};
MODULE_ALIAS_FS("ecryptfs");
-/**
+/*
* inode_info_init_once
*
* Initializes the ecryptfs_inode_info_cache when it is created
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index c0dfd9647627..6318f3500e5c 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 2004-2008 International Business Machines Corp.
@@ -14,10 +14,10 @@
static LIST_HEAD(ecryptfs_msg_ctx_free_list);
static LIST_HEAD(ecryptfs_msg_ctx_alloc_list);
-static struct mutex ecryptfs_msg_ctx_lists_mux;
+static DEFINE_MUTEX(ecryptfs_msg_ctx_lists_mux);
static struct hlist_head *ecryptfs_daemon_hash;
-struct mutex ecryptfs_daemon_hash_mux;
+DEFINE_MUTEX(ecryptfs_daemon_hash_mux);
static int ecryptfs_hash_bits;
#define ecryptfs_current_euid_hash(uid) \
hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits)
@@ -147,7 +147,7 @@ out:
return rc;
}
-/**
+/*
* ecryptfs_exorcise_daemon - Destroy the daemon struct
*
* Must be called ceremoniously while in possession of
@@ -181,7 +181,8 @@ out:
}
/**
- * ecryptfs_process_reponse
+ * ecryptfs_process_response
+ * @daemon: eCryptfs daemon object
* @msg: The ecryptfs message received; the caller should sanity check
* msg->data_len and free the memory
* @seq: The sequence number of the message; must match the sequence
@@ -250,6 +251,7 @@ out:
* ecryptfs_send_message_locked
* @data: The data to send
* @data_len: The length of data
+ * @msg_type: Type of message
* @msg_ctx: The message context allocated for the send
*
* Must be called with ecryptfs_daemon_hash_mux held.
@@ -359,7 +361,6 @@ int __init ecryptfs_init_messaging(void)
"too large, defaulting to [%d] users\n", __func__,
ecryptfs_number_of_users);
}
- mutex_init(&ecryptfs_daemon_hash_mux);
mutex_lock(&ecryptfs_daemon_hash_mux);
ecryptfs_hash_bits = 1;
while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
@@ -383,7 +384,6 @@ int __init ecryptfs_init_messaging(void)
rc = -ENOMEM;
goto out;
}
- mutex_init(&ecryptfs_msg_ctx_lists_mux);
mutex_lock(&ecryptfs_msg_ctx_lists_mux);
ecryptfs_msg_counter = 0;
for (i = 0; i < ecryptfs_message_buf_len; i++) {
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 742ece22c1d4..4e62c3cef70f 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 2008 International Business Machines Corp.
@@ -312,6 +312,7 @@ out_unlock_daemon:
/**
* ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
+ * @daemon: eCryptfs daemon object
* @data: Bytes comprising struct ecryptfs_message
* @data_size: sizeof(struct ecryptfs_message) + data len
* @seq: Sequence number for miscdev response packet
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2f333a40ff4d..392e721b50a3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
* This is where eCryptfs coordinates the symmetric encryption and
* decryption of the file data as it passes between the lower
@@ -22,7 +22,7 @@
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
-/**
+/*
* ecryptfs_get_locked_page
*
* Get one page from cache or lower f/s, return error otherwise.
@@ -41,6 +41,7 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
/**
* ecryptfs_writepage
* @page: Page that is locked before this call is made
+ * @wbc: Write-back control structure
*
* Returns zero on success; non-zero otherwise
*
@@ -78,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
}
}
-/**
+/*
* Header Extent:
* Octets 0-7: Unencrypted file size (big-endian)
* Octets 8-15: eCryptfs special marker
@@ -229,7 +230,7 @@ out:
return rc;
}
-/**
+/*
* Called with lower inode mutex held.
*/
static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
@@ -368,7 +369,7 @@ out:
return rc;
}
-/**
+/*
* ecryptfs_write_inode_size_to_header
*
* Writes the lower file size to the first 8 bytes of the header.
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0438997ac9d8..60bdcaddcbe5 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 2007 International Business Machines Corp.
@@ -230,6 +230,8 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
* ecryptfs_read_lower_page_segment
* @page_for_ecryptfs: The page into which data for eCryptfs will be
* written
+ * @page_index: Page index in @page_for_ecryptfs from which to start
+ * writing
* @offset_in_page: Offset in @page_for_ecryptfs from which to start
* writing
* @size: The number of bytes to write into @page_for_ecryptfs
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 6b1853f1c06a..39116af0390f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* eCryptfs: Linux filesystem encryption layer
*
* Copyright (C) 1997-2003 Erez Zadok
@@ -81,7 +81,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
/**
* ecryptfs_statfs
- * @sb: The ecryptfs super block
+ * @dentry: The ecryptfs dentry
* @buf: The struct kstatfs to fill in with stats
*
* Get the filesystem statistics. Currently, we let this pass right through
@@ -108,7 +108,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/**
* ecryptfs_evict_inode
- * @inode - The ecryptfs inode
+ * @inode: The ecryptfs inode
*
* Called by iput() when the inode reference count reached zero
* and the inode is not hashed anywhere. Used to clear anything
@@ -123,7 +123,7 @@ static void ecryptfs_evict_inode(struct inode *inode)
iput(ecryptfs_inode_to_lower(inode));
}
-/**
+/*
* ecryptfs_show_options
*
* Prints the mount options for a given superblock.
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index e62d813756f2..efaf32596b97 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -450,14 +450,31 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
lcn = m->lcn + 1;
if (m->compressedlcs)
goto out;
- if (lcn == initial_lcn)
- goto err_bonus_cblkcnt;
err = z_erofs_load_cluster_from_disk(m, lcn);
if (err)
return err;
+ /*
+ * If the 1st NONHEAD lcluster has already been handled initially w/o
+ * valid compressedlcs, which means at least it mustn't be CBLKCNT, or
+ * an internal implementation error is detected.
+ *
+ * The following code can also handle it properly anyway, but let's
+ * BUG_ON in the debugging mode only for developers to notice that.
+ */
+ DBG_BUGON(lcn == initial_lcn &&
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD);
+
switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
+ * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+ */
+ m->compressedlcs = 1;
+ break;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
if (m->delta[0] != 1)
goto err_bonus_cblkcnt;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 73138ea68342..1e596e1d0bba 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep,
*/
list_splice(txlist, &ep->rdllist);
__pm_relax(ep->ws);
+
+ if (!list_empty(&ep->rdllist)) {
+ if (waitqueue_active(&ep->wq))
+ wake_up(&ep->wq);
+ }
+
write_unlock_irq(&ep->lock);
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c6b8bba73031..1f69b81655b6 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -81,11 +81,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns
struct dentry *ext2_get_parent(struct dentry *child)
{
- struct qstr dotdot = QSTR_INIT("..", 2);
ino_t ino;
int res;
- res = ext2_inode_by_name(d_inode(child), &dotdot, &ino);
+ res = ext2_inode_by_name(d_inode(child), &dotdot_name, &ino);
if (res)
return ERR_PTR(res);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 74a5172c2d83..9dc6e74b265c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -239,7 +239,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- return num_clusters_in_group(sb, block_group) -
+ return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 5ed870614c8d..ffb295aa891c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode)
return 0;
}
+static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
+{
+ /* Check if . or .. , or skip if namelen is 0 */
+ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') &&
+ (de->name[1] == '.' || de->name[1] == '\0'))
+ return true;
+ /* Check if this is a csum entry */
+ if (de->file_type == EXT4_FT_DIR_CSUM)
+ return true;
+ return false;
+}
+
/*
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
@@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
+ bool fake = is_fake_dir_entry(de);
+ bool has_csum = ext4_has_metadata_csum(dir->i_sb);
- if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+ if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+ else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
+ fake ? NULL : dir)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
- else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
+ has_csum ? NULL : dir) &&
next_offset != size))
error_msg = "directory entry too close to block end";
else if (unlikely(le32_to_cpu(de->inode) >
@@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
if (filp)
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
return 1;
}
@@ -124,9 +140,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
- if (err != ERR_BAD_DX_DIR) {
+ if (err != ERR_BAD_DX_DIR)
return err;
- }
+
/* Can we just clear INDEX flag to ignore htree information? */
if (!ext4_has_metadata_csum(sb)) {
/*
@@ -224,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
- sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+ sb->s_blocksize) < ext4_dir_rec_len(1,
+ inode))
break;
i += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
@@ -265,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
/* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode,
- 0, 0, &de_name, &fstr);
+ EXT4_DIRENT_HASH(de),
+ EXT4_DIRENT_MINOR_HASH(de),
+ &de_name, &fstr);
de_name = fstr;
fstr.len = save_len;
if (err)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 18f021c988a1..37002663d521 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -162,7 +162,12 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-
+/* Large fragment size list lookup succeeded at least once for cr = 0 */
+#define EXT4_MB_CR0_OPTIMIZED 0x8000
+/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+#define EXT4_MB_CR1_OPTIMIZED 0x00010000
+/* Perform linear traversal for one group */
+#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -1213,7 +1218,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
-#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
+#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1238,7 +1243,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
-
+#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
+ * scanning in mballoc
+ */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1519,9 +1526,14 @@ struct ext4_sb_info {
unsigned int s_mb_free_pending;
struct list_head s_freed_data_list; /* List of blocks to be freed
after commit completed */
+ struct rb_root s_mb_avg_fragment_size_root;
+ rwlock_t s_mb_rb_lock;
+ struct list_head *s_mb_largest_free_orders;
+ rwlock_t *s_mb_largest_free_orders_locks;
/* tunables */
unsigned long s_stripe;
+ unsigned int s_mb_max_linear_groups;
unsigned int s_mb_stream_request;
unsigned int s_mb_max_to_scan;
unsigned int s_mb_min_to_scan;
@@ -1541,12 +1553,17 @@ struct ext4_sb_info {
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- spinlock_t s_bal_lock;
- unsigned long s_mb_buddies_generated;
- unsigned long long s_mb_generation_time;
+ atomic_t s_bal_cr0_bad_suggestions;
+ atomic_t s_bal_cr1_bad_suggestions;
+ atomic64_t s_bal_cX_groups_considered[4];
+ atomic64_t s_bal_cX_hits[4];
+ atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
+ atomic_t s_mb_buddies_generated; /* number of buddies generated */
+ atomic64_t s_mb_generation_time;
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
@@ -2187,6 +2204,17 @@ struct ext4_dir_entry {
char name[EXT4_NAME_LEN]; /* File name */
};
+
+/*
+ * Encrypted Casefolded entries require saving the hash on disk. This structure
+ * follows ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
+ * boundary.
+ */
+struct ext4_dir_entry_hash {
+ __le32 hash;
+ __le32 minor_hash;
+};
+
/*
* The new version of the directory entry. Since EXT4 structures are
* stored in intel byte order, and the name_len field could never be
@@ -2202,6 +2230,22 @@ struct ext4_dir_entry_2 {
};
/*
+ * Access the hashes at the end of ext4_dir_entry_2
+ */
+#define EXT4_DIRENT_HASHES(entry) \
+ ((struct ext4_dir_entry_hash *) \
+ (((void *)(entry)) + \
+ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
+#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
+#define EXT4_DIRENT_MINOR_HASH(entry) \
+ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)
+
+static inline bool ext4_hash_in_dirent(const struct inode *inode)
+{
+ return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode);
+}
+
+/*
* This is a bogus directory entry at the end of each leaf block that
* records checksums.
*/
@@ -2242,11 +2286,25 @@ struct ext4_dir_entry_tail {
*/
#define EXT4_DIR_PAD 4
#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
- ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
/*
+ * The rec_len is dependent on the type of directory. Directories that are
+ * casefolded and encrypted need to store the hash as well, so we add room for
+ * struct ext4_dir_entry_hash. For all entries related to '.' or '..' you should
+ * pass NULL for dir, as those entries do not use the extra fields.
+ */
+static inline unsigned int ext4_dir_rec_len(__u8 name_len,
+ const struct inode *dir)
+{
+ int rec_len = (name_len + 8 + EXT4_DIR_ROUND);
+
+ if (dir && ext4_hash_in_dirent(dir))
+ rec_len += sizeof(struct ext4_dir_entry_hash);
+ return (rec_len & ~EXT4_DIR_ROUND);
+}
+
+/*
* If we ever get support for fs block sizes > page_size, we'll need
* to remove the #if statements in the next two functions...
*/
@@ -2302,6 +2360,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_LEGACY_UNSIGNED 3
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
+#define DX_HASH_SIPHASH 6
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
const void *address, unsigned int length)
@@ -2356,6 +2415,7 @@ struct ext4_filename {
};
#define fname_name(p) ((p)->disk_name.name)
+#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p) ((p)->disk_name.len)
/*
@@ -2586,9 +2646,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
#ifdef CONFIG_UNICODE
-extern void ext4_fname_setup_ci_filename(struct inode *dir,
+extern int ext4_fname_setup_ci_filename(struct inode *dir,
const struct qstr *iname,
- struct fscrypt_str *fname);
+ struct ext4_filename *fname);
#endif
#ifdef CONFIG_FS_ENCRYPTION
@@ -2619,9 +2679,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
- return 0;
+ return err;
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2638,9 +2698,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
+ err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
#endif
- return 0;
+ return err;
}
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
@@ -2665,15 +2725,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
int lookup,
struct ext4_filename *fname)
{
+ int err = 0;
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
- return 0;
+ return err;
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2698,9 +2759,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct ext4_dir_entry_2 *,
struct buffer_head *, char *, int,
unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
- (de), (bh), (buf), (size), (offset)))
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent,
@@ -2711,7 +2772,7 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname);
@@ -2802,8 +2863,10 @@ int __init ext4_fc_init_dentry_cache(void);
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
+extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
@@ -3306,11 +3369,14 @@ struct ext4_group_info {
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ ext4_group_t bb_group; /* Group number */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
+ struct rb_node bb_avg_fragment_size_rb;
+ struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3513,9 +3579,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int ext4_ci_compare(const struct inode *parent,
- const struct qstr *fname,
- const struct qstr *entry, bool quick);
extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
struct inode *inode);
extern int __ext4_link(struct inode *dir, struct inode *inode,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 7541d0b5d706..f98ca4f37ef6 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -66,7 +66,7 @@
* Fast Commit Ineligibility
* -------------------------
* Not all operations are supported by fast commits today (e.g extended
- * attributes). Fast commit ineligiblity is marked by calling one of the
+ * attributes). Fast commit ineligibility is marked by calling one of the
* two following functions:
*
* - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
@@ -1088,8 +1088,10 @@ static int ext4_fc_perform_commit(journal_t *journal)
head.fc_tid = cpu_to_le32(
sbi->s_journal->j_running_transaction->t_tid);
if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
- (u8 *)&head, &crc))
+ (u8 *)&head, &crc)) {
+ ret = -ENOSPC;
goto out;
+ }
}
spin_lock(&sbi->s_fc_lock);
@@ -1734,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
}
/* Range is mapped and needs a state change */
- jbd_debug(1, "Converting from %d to %d %lld",
+ jbd_debug(1, "Converting from %ld to %d %lld",
map.m_flags & EXT4_MAP_UNWRITTEN,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5332dd3ea7e2..816dedcbd541 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -371,15 +371,32 @@ truncate:
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
int error, unsigned int flags)
{
- loff_t offset = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
if (error)
return error;
- if (size && flags & IOMAP_DIO_UNWRITTEN)
- return ext4_convert_unwritten_extents(NULL, inode,
- offset, size);
+ if (size && flags & IOMAP_DIO_UNWRITTEN) {
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
+ if (error < 0)
+ return error;
+ }
+ /*
+ * If we are extending the file, we have to update i_size here before
+ * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
+ * buffered reads could zero out too much from page cache pages. Update
+ * of on-disk size will happen later in ext4_dio_write_iter() where
+ * we have enough information to also perform orphan list handling etc.
+ * Note that we perform all extending writes synchronously under
+ * i_rwsem held exclusively so i_size update is safe here in that case.
+ * If the write was not extending, we cannot see pos > i_size here
+ * because operations reducing i_size like truncate wait for all
+ * outstanding DIO before updating i_size.
+ */
+ pos += size;
+ if (pos > i_size_read(inode))
+ i_size_write(inode, pos);
return 0;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index a92eb79de0cc..f34f4176c1e7 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
-static int __ext4fs_dirhash(const char *name, int len,
+static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
__u32 hash;
@@ -259,6 +259,22 @@ static int __ext4fs_dirhash(const char *name, int len,
hash = buf[0];
minor_hash = buf[1];
break;
+ case DX_HASH_SIPHASH:
+ {
+ struct qstr qname = QSTR_INIT(name, len);
+ __u64 combined_hash;
+
+ if (fscrypt_has_encryption_key(dir)) {
+ combined_hash = fscrypt_fname_siphash(dir, &qname);
+ } else {
+ ext4_warning_inode(dir, "Siphash requires key");
+ return -1;
+ }
+
+ hash = (__u32)(combined_hash >> 32);
+ minor_hash = (__u32)combined_hash;
+ break;
+ }
default:
hinfo->hash = 0;
return -1;
@@ -280,7 +296,8 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir) && um) {
+ if (len && IS_CASEFOLDED(dir) && um &&
+ (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
@@ -291,12 +308,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
goto opaque_seq;
}
- r = __ext4fs_dirhash(buff, dlen, hinfo);
+ r = __ext4fs_dirhash(dir, buff, dlen, hinfo);
kfree(buff);
return r;
}
opaque_seq:
#endif
- return __ext4fs_dirhash(name, len, hinfo);
+ return __ext4fs_dirhash(dir, name, len, hinfo);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 755a68bb7e22..81a17a3cd80e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1292,7 +1292,8 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
- if (ext4_has_feature_inline_data(sb))
+ if (ext4_has_feature_inline_data(sb) &&
+ (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1513,6 +1514,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
handle_t *handle;
ext4_fsblk_t blk;
int num, ret = 0, used_blks = 0;
+ unsigned long used_inos = 0;
/* This should not happen, but just to be sure check this */
if (sb_rdonly(sb)) {
@@ -1543,22 +1545,37 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
* used inodes so we need to skip blocks with used inodes in
* inode table.
*/
- if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
- used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)),
- sbi->s_inodes_per_block);
-
- if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) ||
- ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)) <
- EXT4_FIRST_INO(sb)))) {
- ext4_error(sb, "Something is wrong with group %u: "
- "used itable blocks: %d; "
- "itable unused count: %u",
- group, used_blks,
- ext4_itable_unused_count(sb, gdp));
- ret = 1;
- goto err_out;
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
+ used_inos = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);
+
+ /* Bogus inode unused count? */
+ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ used_inos += group * EXT4_INODES_PER_GROUP(sb);
+ /*
+ * Are there some uninitialized inodes in the inode table
+ * before the first normal inode?
+ */
+ if ((used_blks != sbi->s_itb_per_group) &&
+ (used_inos < EXT4_FIRST_INO(sb))) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "itable unused count: %u; "
+ "itables initialized count: %ld",
+ group, ext4_itable_unused_count(sb, gdp),
+ used_inos);
+ ret = 1;
+ goto err_out;
+ }
}
blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 1223a18c3ff9..a7bc6ad656a9 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -705,7 +705,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
/*
* Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
+ * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index b41512d1badc..3cf01629010d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -795,7 +795,7 @@ ext4_journalled_write_inline_data(struct inode *inode,
* clear the inode state safely.
* 2. The inode has inline data, then we need to read the data, make it
* update and dirty so that ext4_da_writepages can handle it. We don't
- * need to start the journal since the file's metatdata isn't changed now.
+ * need to start the journal since the file's metadata isn't changed now.
*/
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
@@ -1031,7 +1031,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(inode, de, inline_size, fname);
+ ext4_insert_dentry(dir, inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1100,7 +1100,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
int new_size = get_max_inline_xattr_value_size(dir, iloc);
- if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
return -ENOSPC;
ret = ext4_update_inline_data(handle, dir,
@@ -1380,8 +1380,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 1;
strcpy(fake.name, ".");
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_OFFSET;
@@ -1390,8 +1390,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 2;
strcpy(fake.name, "..");
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_SIZE;
@@ -1406,7 +1406,12 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
}
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1488,8 +1493,8 @@ int ext4_read_inline_dir(struct file *file,
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
- dotdot_offset = EXT4_DIR_REC_LEN(1);
- dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ dotdot_offset = ext4_dir_rec_len(1, NULL);
+ dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
@@ -1524,7 +1529,7 @@ int ext4_read_inline_dir(struct file *file,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len, extra_size)
- < EXT4_DIR_REC_LEN(1))
+ < ext4_dir_rec_len(1, NULL))
break;
i += ext4_rec_len_from_disk(de->rec_len,
extra_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0948a43f1b3d..fe6045a46599 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1066,8 +1066,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
}
continue;
}
@@ -1092,8 +1091,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
}
if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
@@ -3824,7 +3822,7 @@ unlock:
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
+ * that corresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e9b0a1fa2ba8..31627f7dc5cd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -316,6 +316,12 @@ static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
static bool dax_compatible(struct inode *inode, unsigned int oldflags,
unsigned int flags)
{
+ /* Allow the DAX flag to be changed on inline directories */
+ if (S_ISDIR(inode->i_mode)) {
+ flags &= ~EXT4_INLINE_DATA_FL;
+ oldflags &= ~EXT4_INLINE_DATA_FL;
+ }
+
if (flags & EXT4_DAX_FL) {
if ((oldflags & EXT4_DAX_MUT_EXCL) ||
ext4_test_inode_state(inode,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a02fadf4fc84..3239e6669e84 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -127,11 +127,50 @@
* smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
+ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
+ * structures in two data structures:
+ *
+ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ *
+ * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
+ *
+ * This is an array of lists where the index in the array represents the
+ * largest free order in the buddy bitmap of the participating group infos of
+ * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
+ * number of buddy bitmap orders possible) number of lists. Group-infos are
+ * placed in appropriate lists.
+ *
+ * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
+ *
+ * Locking: sbi->s_mb_rb_lock (rwlock)
+ *
+ * This is a red black tree consisting of group infos and the tree is sorted
+ * by average fragment sizes (which is calculated as ext4_group_info->bb_free
+ * / ext4_group_info->bb_fragments).
+ *
+ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
+ * structures to decide the order in which groups are to be traversed for
+ * fulfilling an allocation request.
+ *
+ * At CR = 0, we look for groups which have the largest_free_order >= the order
+ * of the request. We directly look at the largest free order list in the data
+ * structure (1) above where largest_free_order = order of the request. If that
+ * list is empty, we look at the remaining lists in increasing order of
+ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
+ *
+ * At CR = 1, we only consider groups where average fragment size > request
+ * size. So, we lookup a group which has average fragment size just above or
+ * equal to request size using our rb tree (data structure 2) in O(log N) time.
+ *
+ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
+ * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
+ *
* The regular allocator (using the buddy cache) supports a few tunables.
*
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
+ * /sys/fs/ext4/<partition>/mb_linear_limit
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -149,6 +188,16 @@
* can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
+ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
+ * get traversed linearly. That may result in subsequent allocations being not
+ * close to each other. And so, the underlying device may get filled up in a
+ * non-linear fashion. While that may not matter on non-rotational devices, for
+ * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * tells mballoc how many groups mballoc should search linearly before
+ * consulting the above data structures for more efficient lookups. For
+ * non rotational devices, this value defaults to 0 and for rotational devices
+ * this is set to MB_DEFAULT_LINEAR_LIMIT.
+ *
* Both the prealloc space are getting populated as above. So for the first
* request we will hit the buddy cache which will result in this prealloc
* space getting filled. The prealloc space is then later used for the
@@ -299,6 +348,8 @@
* - bitlock on a group (group)
* - object (inode/locality) (object)
* - per-pa lock (pa)
+ * - cr0 lists lock (cr0)
+ * - cr1 tree lock (cr1)
*
* Paths:
* - new pa
@@ -328,6 +379,9 @@
* group
* object
*
+ * - allocation path (ext4_mb_regular_allocator)
+ * group
+ * cr0/cr1
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
@@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr);
+
/*
* The algorithm using this percpu seq counter goes below:
* 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
+ int (*cmp)(struct rb_node *, struct rb_node *))
+{
+ struct rb_node **iter = &root->rb_node, *parent = NULL;
+
+ while (*iter) {
+ parent = *iter;
+ if (cmp(new, *iter) > 0)
+ iter = &((*iter)->rb_left);
+ else
+ iter = &((*iter)->rb_right);
+ }
+
+ rb_link_node(new, parent, iter);
+ rb_insert_color(new, root);
+}
+
+static int
+ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
+{
+ struct ext4_group_info *grp1 = rb_entry(rb1,
+ struct ext4_group_info,
+ bb_avg_fragment_size_rb);
+ struct ext4_group_info *grp2 = rb_entry(rb2,
+ struct ext4_group_info,
+ bb_avg_fragment_size_rb);
+ int num_frags_1, num_frags_2;
+
+ num_frags_1 = grp1->bb_fragments ?
+ grp1->bb_free / grp1->bb_fragments : 0;
+ num_frags_2 = grp2->bb_fragments ?
+ grp2->bb_free / grp2->bb_fragments : 0;
+
+ return (num_frags_2 - num_frags_1);
+}
+
+/*
+ * Reinsert grpinfo into the avg_fragment_size tree with new average
+ * fragment size.
+ */
+static void
+mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+ return;
+
+ write_lock(&sbi->s_mb_rb_lock);
+ if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
+ rb_erase(&grp->bb_avg_fragment_size_rb,
+ &sbi->s_mb_avg_fragment_size_root);
+ RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
+ }
+
+ ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
+ &grp->bb_avg_fragment_size_rb,
+ ext4_mb_avg_fragment_size_cmp);
+ write_unlock(&sbi->s_mb_rb_lock);
+}
+
+/*
+ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
+ * cr level needs an update.
+ */
+static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *iter, *grp;
+ int i;
+
+ if (ac->ac_status == AC_STATUS_FOUND)
+ return;
+
+ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
+ atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
+
+ grp = NULL;
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ if (list_empty(&sbi->s_mb_largest_free_orders[i]))
+ continue;
+ read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
+ if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ continue;
+ }
+ grp = NULL;
+ list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
+ bb_largest_free_order_node) {
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
+ grp = iter;
+ break;
+ }
+ }
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ if (grp)
+ break;
+ }
+
+ if (!grp) {
+ /* Increment cr and search again */
+ *new_cr = 1;
+ } else {
+ *group = grp->bb_group;
+ ac->ac_last_optimal_group = *group;
+ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
+ }
+}
+
+/*
+ * Choose next group by traversing average fragment size tree. Updates *new_cr
+ * if cr level needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
+ * the linear search should continue for one iteration since there's lock
+ * contention on the rb tree lock.
+ */
+static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int avg_fragment_size, best_so_far;
+ struct rb_node *node, *found;
+ struct ext4_group_info *grp;
+
+ /*
+ * If there is contention on the lock, instead of waiting for the lock
+ * to become available, just continue searching linearly. We'll resume
+ * our rb tree search later starting at ac->ac_last_optimal_group.
+ */
+ if (!read_trylock(&sbi->s_mb_rb_lock)) {
+ ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
+ return;
+ }
+
+ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
+ if (sbi->s_mb_stats)
+ atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
+ /* We have found something at CR 1 in the past */
+ grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
+ for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
+ found = rb_next(found)) {
+ grp = rb_entry(found, struct ext4_group_info,
+ bb_avg_fragment_size_rb);
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
+ if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
+ break;
+ }
+ goto done;
+ }
+
+ node = sbi->s_mb_avg_fragment_size_root.rb_node;
+ best_so_far = 0;
+ found = NULL;
+
+ while (node) {
+ grp = rb_entry(node, struct ext4_group_info,
+ bb_avg_fragment_size_rb);
+ avg_fragment_size = 0;
+ if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
+ avg_fragment_size = grp->bb_fragments ?
+ grp->bb_free / grp->bb_fragments : 0;
+ if (!best_so_far || avg_fragment_size < best_so_far) {
+ best_so_far = avg_fragment_size;
+ found = node;
+ }
+ }
+ if (avg_fragment_size > ac->ac_g_ex.fe_len)
+ node = node->rb_right;
+ else
+ node = node->rb_left;
+ }
+
+done:
+ if (found) {
+ grp = rb_entry(found, struct ext4_group_info,
+ bb_avg_fragment_size_rb);
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
+ } else {
+ *new_cr = 2;
+ }
+
+ read_unlock(&sbi->s_mb_rb_lock);
+ ac->ac_last_optimal_group = *group;
+}
+
+static inline int should_optimize_scan(struct ext4_allocation_context *ac)
+{
+ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
+ return 0;
+ if (ac->ac_criteria >= 2)
+ return 0;
+ if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ return 0;
+ return 1;
+}
+
+/*
+ * Return next linear group for allocation. If linear traversal should not be
+ * performed, this function just returns the same group
+ */
+static int
+next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
+{
+ if (!should_optimize_scan(ac))
+ goto inc_and_return;
+
+ if (ac->ac_groups_linear_remaining) {
+ ac->ac_groups_linear_remaining--;
+ goto inc_and_return;
+ }
+
+ if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
+ ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
+ goto inc_and_return;
+ }
+
+ return group;
+inc_and_return:
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ return group + 1 >= ngroups ? 0 : group + 1;
+}
+
+/*
+ * ext4_mb_choose_next_group: choose next group for allocation.
+ *
+ * @ac Allocation Context
+ * @new_cr This is an output parameter. If there is no good group
+ * available at current CR level, this field is updated to indicate
+ * the new cr level that should be used.
+ * @group This is an input / output parameter. As an input it indicates the
+ * next group that the allocator intends to use for allocation. As
+ * output, this field indicates the next group that should be used as
+ * determined by the optimization functions.
+ * @ngroups Total number of groups
+ */
+static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ *new_cr = ac->ac_criteria;
+
+ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+ return;
+
+ if (*new_cr == 0) {
+ ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
+ } else if (*new_cr == 1) {
+ ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+ } else {
+ /*
+ * TODO: For CR=2, we can arrange groups in an rb tree sorted by
+ * bb_free. But until that happens, we should never come here.
+ */
+ WARN_ON(1);
+ }
+}
+
/*
* Cache the order of the largest free extent we have available in this block
* group.
@@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
- int bits;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ list_del_init(&grp->bb_largest_free_order_node);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ }
grp->bb_largest_free_order = -1; /* uninit */
- bits = sb->s_blocksize_bits + 1;
- for (i = bits; i >= 0; i--) {
+ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
if (grp->bb_counters[i] > 0) {
grp->bb_largest_free_order = i;
break;
}
}
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
+ grp->bb_largest_free_order >= 0 && grp->bb_free) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ list_add_tail(&grp->bb_largest_free_order_node,
+ &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ }
}
static noinline_for_stack
@@ -816,10 +1151,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
- spin_lock(&sbi->s_bal_lock);
- sbi->s_mb_buddies_generated++;
- sbi->s_mb_generation_time += period;
- spin_unlock(&sbi->s_bal_lock);
+ atomic_inc(&sbi->s_mb_buddies_generated);
+ atomic64_add(period, &sbi->s_mb_generation_time);
+ mb_update_avg_fragment_size(sb, grp);
}
/* The buddy information is attached the buddy cache inode
@@ -959,7 +1293,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
- (sb->s_blocksize_bits+2));
+ (MB_NUM_ORDERS(sb)));
/*
* incore got set to the group block bitmap below
*/
@@ -1519,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
done:
mb_set_largest_free_order(sb, e4b->bd_info);
+ mb_update_avg_fragment_size(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1655,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
@@ -1930,7 +2266,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
int max;
BUG_ON(ac->ac_2order <= 0);
- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
if (grp->bb_counters[i] == 0)
continue;
@@ -2109,7 +2445,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
if (free < ac->ac_g_ex.fe_len)
return false;
- if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
+ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
return true;
if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2148,6 +2484,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_grpblk_t free;
int ret = 0;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
if (should_lock)
ext4_lock_group(sb, group);
free = grp->bb_free;
@@ -2315,13 +2653,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* We also support searching for power-of-two requests only for
* requests upto maximum buddy size we have constructed.
*/
- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
+ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
/*
* This should tell if fe_len is exactly power of 2
*/
if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
ac->ac_2order = array_index_nospec(i - 1,
- sb->s_blocksize_bits + 2);
+ MB_NUM_ORDERS(sb));
}
/* if stream allocation is enabled, use global goal */
@@ -2347,17 +2685,21 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ ac->ac_last_optimal_group = group;
+ ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
- for (i = 0; i < ngroups; group++, i++) {
- int ret = 0;
+ for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
+ i++) {
+ int ret = 0, new_cr;
+
cond_resched();
- /*
- * Artificially restricted ngroups for non-extent
- * files makes group > ngroups possible on first loop.
- */
- if (group >= ngroups)
- group = 0;
+
+ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
+ if (new_cr != cr) {
+ cr = new_cr;
+ goto repeat;
+ }
/*
* Batch reads of the block allocation bitmaps
@@ -2422,6 +2764,9 @@ repeat:
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2451,6 +2796,9 @@ repeat:
goto repeat;
}
}
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
@@ -2550,6 +2898,157 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
.show = ext4_mb_seq_groups_show,
};
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = (struct super_block *)seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_puts(seq, "mballoc:\n");
+ if (!sbi->s_mb_stats) {
+ seq_puts(seq, "\tmb stats collection turned off.\n");
+ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ return 0;
+ }
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
+
+ seq_puts(seq, "\tcr0_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[0]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_cr0_bad_suggestions));
+
+ seq_puts(seq, "\tcr1_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[1]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_cr1_bad_suggestions));
+
+ seq_puts(seq, "\tcr2_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[2]));
+
+ seq_puts(seq, "\tcr3_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[3]));
+ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ ext4_get_groups_count(sb));
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
+ atomic64_read(&sbi->s_mb_generation_time));
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n",
+ atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}
+
+static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
+{
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ unsigned long position;
+
+ read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
+
+ if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ unsigned long position;
+
+ ++*pos;
+ if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
+{
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long position = ((unsigned long) v);
+ struct ext4_group_info *grp;
+ struct rb_node *n;
+ unsigned int count, min, max;
+
+ position--;
+ if (position >= MB_NUM_ORDERS(sb)) {
+ seq_puts(seq, "fragment_size_tree:\n");
+ n = rb_first(&sbi->s_mb_avg_fragment_size_root);
+ if (!n) {
+ seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
+ return 0;
+ }
+ grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
+ min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+ count = 1;
+ while (rb_next(n)) {
+ count++;
+ n = rb_next(n);
+ }
+ grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
+ max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+
+ seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
+ min, max, count);
+ return 0;
+ }
+
+ if (position == 0) {
+ seq_printf(seq, "optimize_scan: %d\n",
+ test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
+ seq_puts(seq, "max_free_order_lists:\n");
+ }
+ count = 0;
+ list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
+ bb_largest_free_order_node)
+ count++;
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
+
+ return 0;
+}
+
+static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
+{
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
+
+ read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
+}
+
+const struct seq_operations ext4_mb_seq_structs_summary_ops = {
+ .start = ext4_mb_seq_structs_summary_start,
+ .next = ext4_mb_seq_structs_summary_next,
+ .stop = ext4_mb_seq_structs_summary_stop,
+ .show = ext4_mb_seq_structs_summary_show,
+};
+
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
@@ -2590,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
if (old_groupinfo)
ext4_kvfree_array_rcu(old_groupinfo);
- ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
}
@@ -2652,7 +3151,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
+ RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+ meta_group_info[i]->bb_group = group;
mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
return 0;
@@ -2813,7 +3315,7 @@ int ext4_mb_init(struct super_block *sb)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
@@ -2821,7 +3323,7 @@ int ext4_mb_init(struct super_block *sb)
goto out;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
ret = -ENOMEM;
@@ -2847,10 +3349,30 @@ int ext4_mb_init(struct super_block *sb)
offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
- } while (i <= sb->s_blocksize_bits + 1);
+ } while (i < MB_NUM_ORDERS(sb));
+
+ sbi->s_mb_avg_fragment_size_root = RB_ROOT;
+ sbi->s_mb_largest_free_orders =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!sbi->s_mb_largest_free_orders) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sbi->s_mb_largest_free_orders_locks =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!sbi->s_mb_largest_free_orders_locks) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+ INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
+ rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
+ }
+ rwlock_init(&sbi->s_mb_rb_lock);
spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_free_pending = 0;
INIT_LIST_HEAD(&sbi->s_freed_data_list);
@@ -2901,6 +3423,10 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+ sbi->s_mb_max_linear_groups = 0;
+ else
+ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
if (ret != 0)
@@ -2912,6 +3438,8 @@ out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
out:
+ kfree(sbi->s_mb_largest_free_orders);
+ kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -2968,6 +3496,8 @@ int ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
+ kfree(sbi->s_mb_largest_free_orders);
+ kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -2978,17 +3508,18 @@ int ext4_mb_release(struct super_block *sb)
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
ext4_msg(sb, KERN_INFO,
- "mballoc: %u extents scanned, %u goal hits, "
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
"%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_groups_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
ext4_msg(sb, KERN_INFO,
- "mballoc: %lu generated and it took %Lu",
- sbi->s_mb_buddies_generated,
- sbi->s_mb_generation_time);
+ "mballoc: %u generated and it took %llu",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ atomic64_read(&sbi->s_mb_generation_time));
ext4_msg(sb, KERN_INFO,
"mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
@@ -3583,12 +4114,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index e75b4749aa1c..39da92ceabf8 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -59,7 +59,7 @@
* by the stream allocator, which purpose is to pack requests
* as close each to other as possible to produce smooth I/O traffic
* We use locality group prealloc space for stream request.
- * We can tune the same via /proc/fs/ext4/<parition>/stream_req
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
*/
#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
@@ -78,6 +78,23 @@
*/
#define MB_DEFAULT_MAX_INODE_PREALLOC 512
+/*
+ * Number of groups to search linearly before performing group scanning
+ * optimization.
+ */
+#define MB_DEFAULT_LINEAR_LIMIT 4
+
+/*
+ * Minimum number of groups that should be present in the file system to perform
+ * group scanning optimizations.
+ */
+#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
+
+/*
+ * Number of valid buddy orders
+ */
+#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
+
struct ext4_free_data {
/* this links the free block information from sb_info */
struct list_head efd_list;
@@ -161,11 +178,14 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
+ ext4_group_t ac_last_optimal_group;
+ __u32 ac_groups_considered;
+ __u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
+ __u16 ac_groups_linear_remaining;
__u16 ac_found;
__u16 ac_tail;
__u16 ac_buddy;
- __u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index c5e3fc998211..7e0b4f81c6c0 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -32,7 +32,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- /* Locking only for convinience since we are operating on temp inode */
+ /* Locking only for convenience since we are operating on temp inode */
down_write(&EXT4_I(inode)->i_data_sem);
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
@@ -43,8 +43,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
/*
* Calculate the credit needed to inserting this extent
- * Since we are doing this in loop we may accumalate extra
- * credit. But below we try to not accumalate too much
+ * Since we are doing this in loop we may accumulate extra
+ * credit. But below we try to not accumulate too much
* of them by restarting the journal.
*/
needed = ext4_ext_calc_credits_for_single_extent(inode,
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 795c3ff2907c..68fbeedd627b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -56,7 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
- return 1;
+ return -EIO;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a37a19fabee4..afb9d05a99ba 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -280,9 +280,11 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
unsigned blocksize, struct dx_hash_info *hinfo,
struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
- struct dx_map_entry *offsets, int count, unsigned blocksize);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
+static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
+ char *to, struct dx_map_entry *offsets,
+ int count, unsigned int blocksize);
+static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
+ unsigned int blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -574,8 +576,9 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
- EXT4_DIR_REC_LEN(2) - infosize;
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(1, NULL) -
+ ext4_dir_rec_len(2, NULL) - infosize;
if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -584,7 +587,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
static inline unsigned dx_node_limit(struct inode *dir)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(0, dir);
if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -673,7 +677,10 @@ static struct stats dx_show_leaf(struct inode *dir,
name = fname_crypto_str.name;
len = fname_crypto_str.len;
}
- ext4fs_dirhash(dir, de->name,
+ if (IS_CASEFOLDED(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else
+ ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
@@ -689,7 +696,7 @@ static struct stats dx_show_leaf(struct inode *dir,
(unsigned) ((char *) de - base));
#endif
}
- space += EXT4_DIR_REC_LEN(de->name_len);
+ space += ext4_dir_rec_len(de->name_len, dir);
names++;
}
de = ext4_next_entry(de, size);
@@ -784,18 +791,34 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
root = (struct dx_root *) frame->bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY) {
+ root->info.hash_version != DX_HASH_LEGACY &&
+ root->info.hash_version != DX_HASH_SIPHASH) {
ext4_warning_inode(dir, "Unrecognised inode hash code %u",
root->info.hash_version);
goto fail;
}
+ if (ext4_hash_in_dirent(dir)) {
+ if (root->info.hash_version != DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash in dirent, but hash is not SIPHASH");
+ goto fail;
+ }
+ } else {
+ if (root->info.hash_version == DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash code is SIPHASH, but hash not in dirent");
+ goto fail;
+ }
+ }
if (fname)
hinfo = &fname->hinfo;
hinfo->hash_version = root->info.hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (fname && fname_name(fname))
+ /* hash is already computed for encrypted casefolded directory */
+ if (fname && fname_name(fname) &&
+ !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)))
ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
@@ -956,7 +979,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* If the hash is 1, then continue only if the next page has a
* continuation hash of any value. This is used for readdir
* handling. Otherwise, check to see if the hash matches the
- * desired contiuation hash. If it doesn't, return since
+ * desired continuation hash. If it doesn't, return since
* there's no point to read in the successive index pages.
*/
bhash = dx_get_hash(p->at);
@@ -997,6 +1020,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
+ int csum = ext4_has_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1005,9 +1029,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
+ /* csum entries are not larger in the casefolded encrypted case */
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(0));
+ ext4_dir_rec_len(0,
+ csum ? NULL : dir));
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
err = fscrypt_prepare_readdir(dir);
@@ -1031,7 +1057,17 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* silently ignore the rest of the block */
break;
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ if (de->name_len && de->inode) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ hinfo->hash = 0;
+ hinfo->minor_hash = 0;
+ }
+ } else {
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1100,7 +1136,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
start_hash, start_minor_hash));
dir = file_inode(dir_file);
if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
- hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ hinfo.hash_version = DX_HASH_SIPHASH;
+ else
+ hinfo.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
@@ -1218,7 +1258,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ if (ext4_hash_in_dirent(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else
+ ext4fs_dirhash(dir, de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
@@ -1282,47 +1325,65 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
* Returns: 0 if the directory entry matches, more than 0 if it
* doesn't match or less than zero on error.
*/
-int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- const struct qstr *entry, bool quick)
+static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
+ u8 *de_name, size_t de_name_len, bool quick)
{
const struct super_block *sb = parent->i_sb;
const struct unicode_map *um = sb->s_encoding;
+ struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
+ struct qstr entry = QSTR_INIT(de_name, de_name_len); /* on-disk name as stored; replaced by the decrypted name below */
int ret;
+ if (IS_ENCRYPTED(parent)) { /* stored name must be decrypted before it can be compared */
+ const struct fscrypt_str encrypted_name =
+ FSTR_INIT(de_name, de_name_len);
+
+ decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); /* scratch buffer, released at out: */
+ if (!decrypted_name.name)
+ return -ENOMEM;
+ ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
+ &decrypted_name);
+ if (ret < 0)
+ goto out; /* negative error is returned to the caller after the buffer is freed */
+ entry.name = decrypted_name.name; /* compare against the decrypted bytes from here on */
+ entry.len = decrypted_name.len;
+ }
+
if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
+ ret = utf8_strncasecmp_folded(um, name, &entry); /* NOTE(review): presumably @name is already casefolded when quick is set - confirm against callers */
else
- ret = utf8_strncasecmp(um, name, entry);
-
+ ret = utf8_strncasecmp(um, name, &entry);
if (ret < 0) {
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
*/
if (sb_has_strict_encoding(sb))
- return -EINVAL;
-
- if (name->len != entry->len)
- return 1;
-
- return !!memcmp(name->name, entry->name, name->len);
+ ret = -EINVAL; /* result is stored, not returned, so control reaches the kfree() at out: */
+ else if (name->len != entry.len)
+ ret = 1;
+ else
+ ret = !!memcmp(name->name, entry.name, entry.len); /* lengths are equal here, so entry.len bounds both buffers */
}
-
+out:
+ kfree(decrypted_name.name); /* NOTE(review): name is presumably still NULL when nothing was allocated, making this a no-op - confirm FSTR_INIT */
return ret;
}
-void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
- struct fscrypt_str *cf_name)
+int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+ struct ext4_filename *name)
{
+ struct fscrypt_str *cf_name = &name->cf_name;
+ struct dx_hash_info *hinfo = &name->hinfo;
int len;
if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) {
cf_name->name = NULL;
- return;
+ return 0;
}
cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
if (!cf_name->name)
- return;
+ return -ENOMEM;
len = utf8_casefold(dir->i_sb->s_encoding,
iname, cf_name->name,
@@ -1330,10 +1391,18 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
if (len <= 0) {
kfree(cf_name->name);
cf_name->name = NULL;
- return;
}
cf_name->len = (unsigned) len;
+ if (!IS_ENCRYPTED(dir))
+ return 0;
+ hinfo->hash_version = DX_HASH_SIPHASH;
+ hinfo->seed = NULL;
+ if (cf_name->name)
+ ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
+ else
+ ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
+ return 0;
}
#endif
@@ -1342,14 +1411,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
*
* Return: %true if the directory entry matches, otherwise %false.
*/
-static inline bool ext4_match(const struct inode *parent,
+static bool ext4_match(struct inode *parent,
const struct ext4_filename *fname,
- const struct ext4_dir_entry_2 *de)
+ struct ext4_dir_entry_2 *de)
{
struct fscrypt_name f;
-#ifdef CONFIG_UNICODE
- const struct qstr entry = {.name = de->name, .len = de->name_len};
-#endif
if (!de->inode)
return false;
@@ -1365,10 +1431,19 @@ static inline bool ext4_match(const struct inode *parent,
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
.len = fname->cf_name.len};
- return !ext4_ci_compare(parent, &cf, &entry, true);
+ if (IS_ENCRYPTED(parent)) {
+ if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash !=
+ EXT4_DIRENT_MINOR_HASH(de)) {
+
+ return 0;
+ }
+ }
+ return !ext4_ci_compare(parent, &cf, de->name,
+ de->name_len, true);
}
- return !ext4_ci_compare(parent, fname->usr_fname, &entry,
- false);
+ return !ext4_ci_compare(parent, fname->usr_fname, de->name,
+ de->name_len, false);
}
#endif
@@ -1739,11 +1814,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- static const struct qstr dotdot = QSTR_INIT("..", 2);
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh)
@@ -1765,7 +1839,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+dx_move_dirents(struct inode *dir, char *from, char *to,
+ struct dx_map_entry *map, int count,
unsigned blocksize)
{
unsigned rec_len = 0;
@@ -1773,11 +1848,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
while (count--) {
struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
+
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
ext4_rec_len_to_disk(rec_len, blocksize);
+
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+
map++;
to += rec_len;
}
@@ -1788,7 +1871,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
+ unsigned int blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
@@ -1797,7 +1881,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
while ((char*)de < base + blocksize) {
next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
if (de > to)
memmove(to, de, rec_len);
to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
@@ -1887,9 +1971,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split,
+ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split,
blocksize);
- de = dx_pack_dirents(data1, blocksize);
+ de = dx_pack_dirents(dir, data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
blocksize);
@@ -1937,7 +2021,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 **dest_de)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
+ unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
int nlen, rlen;
unsigned int offset = 0;
char *top;
@@ -1950,7 +2034,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return -EFSCORRUPTED;
if (ext4_match(dir, fname, de))
return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
break;
@@ -1964,7 +2048,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return 0;
}
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname)
@@ -1972,7 +2057,7 @@ void ext4_insert_dentry(struct inode *inode,
int nlen, rlen;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
@@ -1986,6 +2071,13 @@ void ext4_insert_dentry(struct inode *inode,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
+ if (ext4_hash_in_dirent(dir)) {
+ struct dx_hash_info *hinfo = &fname->hinfo;
+
+ EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
+ EXT4_DIRENT_HASHES(de)->minor_hash =
+ cpu_to_le32(hinfo->minor_hash);
+ }
}
/*
@@ -2022,7 +2114,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
}
/* By now the buffer is marked for journaling */
- ext4_insert_dentry(inode, de, blocksize, fname);
+ ext4_insert_dentry(dir, inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
@@ -2102,6 +2194,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
data2 = bh2->b_data;
memcpy(data2, de, len);
+ memset(de, 0, len); /* wipe old data */
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
@@ -2114,11 +2207,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
- blocksize);
+ de->rec_len = ext4_rec_len_to_disk(
+ blocksize - ext4_dir_rec_len(2, NULL), blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ root->info.hash_version = DX_HASH_SIPHASH;
+ else
+ root->info.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
+
entries = root->entries;
dx_set_block(entries, 1);
dx_set_count(entries, 1);
@@ -2129,7 +2227,11 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
+
+ /* casefolded encrypted hashes are computed on fname setup */
+ if (!ext4_hash_in_dirent(dir))
+ ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), &fname->hinfo);
memset(frames, 0, sizeof(frames));
frame = frames;
@@ -2139,10 +2241,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (retval)
- goto out_frames;
+ goto out_frames;
retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (retval)
- goto out_frames;
+ goto out_frames;
de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
if (IS_ERR(de)) {
@@ -2482,15 +2584,27 @@ int ext4_generic_delete_entry(struct inode *dir,
entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
- if (pde)
+ if (pde) {
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
blocksize) +
ext4_rec_len_from_disk(de->rec_len,
blocksize),
blocksize);
- else
+
+ /* wipe entire dir_entry */
+ memset(de, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize));
+ } else {
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0,
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+ }
+
inode_inc_iversion(dir);
return 0;
}
@@ -2722,7 +2836,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
{
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
strcpy(de->name, ".");
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
@@ -2732,11 +2846,12 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
de->name_len = 2;
if (!dotdot_real_len)
de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + EXT4_DIR_REC_LEN(1)),
+ (csum_size + ext4_dir_rec_len(1, NULL)),
blocksize);
else
de->rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(de->name_len), blocksize);
+ ext4_dir_rec_len(de->name_len, NULL),
+ blocksize);
strcpy(de->name, "..");
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
@@ -2869,7 +2984,8 @@ bool ext4_empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ if (inode->i_size < ext4_dir_rec_len(1, NULL) +
+ ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
return true;
}
@@ -3372,7 +3488,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
* for transaction commit if we are running out of space
* and thus we deadlock. So we have to stop transaction now
* and restart it when symlink contents is written.
- *
+ *
* To keep fs consistent in case of crash, we have to put inode
* to orphan list in the mean time.
*/
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3868377dec2d..7dc94f3e18e6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -667,9 +667,6 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
ext4_commit_super(sb);
}
- if (sb_rdonly(sb) || continue_fs)
- return;
-
/*
* We force ERRORS_RO behavior when system is rebooting. Otherwise we
* could panic during 'reboot -f' as the underlying device got already
@@ -679,6 +676,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
+
+ if (sb_rdonly(sb) || continue_fs)
+ return;
+
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible before
@@ -1688,7 +1689,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_prefetch_block_bitmaps,
+ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
@@ -1788,7 +1789,9 @@ static const match_table_t tokens = {
{Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
- {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
+ {Opt_removed, "prefetch_block_bitmaps"},
+ {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
+ {Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1821,6 +1824,8 @@ static ext4_fsblk_t get_sb_block(void **data)
}
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
+
static const char deprecated_msg[] =
"Mount option \"%s\" will be removed by %s\n"
"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
@@ -2007,8 +2012,9 @@ static const struct mount_opts {
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
- {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
+ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
+ {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
@@ -2090,9 +2096,15 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb,
return 1;
}
+struct ext4_parsed_options {
+ unsigned long journal_devnum; /* journal device number handed to ext4_load_journal(); 0 when not given */
+ unsigned int journal_ioprio; /* I/O priority applied to the journal task; DEFAULT_JOURNAL_IOPRIO when not given */
+ int mb_optimize_scan; /* 0 or 1 from "mb_optimize_scan="; DEFAULT_MB_OPTIMIZE_SCAN (-1) when not given */
+};
+
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
- substring_t *args, unsigned long *journal_devnum,
- unsigned int *journal_ioprio, int is_remount)
+ substring_t *args, struct ext4_parsed_options *parsed_opts,
+ int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
const struct mount_opts *m;
@@ -2249,7 +2261,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"Cannot specify journal on remount");
return -1;
}
- *journal_devnum = arg;
+ parsed_opts->journal_devnum = arg;
} else if (token == Opt_journal_path) {
char *journal_path;
struct inode *journal_inode;
@@ -2285,7 +2297,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
- *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev);
path_put(&path);
kfree(journal_path);
} else if (token == Opt_journal_ioprio) {
@@ -2294,7 +2306,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
" (must be 0-7)");
return -1;
}
- *journal_ioprio =
+ parsed_opts->journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
return ext4_set_test_dummy_encryption(sb, opt, &args[0],
@@ -2384,6 +2396,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_mount_opt |= m->mount_opt;
} else if (token == Opt_data_err_ignore) {
sbi->s_mount_opt &= ~m->mount_opt;
+ } else if (token == Opt_mb_optimize_scan) {
+ if (arg != 0 && arg != 1) {
+ ext4_msg(sb, KERN_WARNING,
+ "mb_optimize_scan should be set to 0 or 1.");
+ return -1;
+ }
+ parsed_opts->mb_optimize_scan = arg;
} else {
if (!args->from)
arg = 1;
@@ -2411,8 +2430,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
static int parse_options(char *options, struct super_block *sb,
- unsigned long *journal_devnum,
- unsigned int *journal_ioprio,
+ struct ext4_parsed_options *ret_opts,
int is_remount)
{
struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
@@ -2432,8 +2450,8 @@ static int parse_options(char *options, struct super_block *sb,
*/
args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
- if (handle_mount_opt(sb, p, token, args, journal_devnum,
- journal_ioprio, is_remount) < 0)
+ if (handle_mount_opt(sb, p, token, args, ret_opts,
+ is_remount) < 0)
return 0;
}
#ifdef CONFIG_QUOTA
@@ -3023,9 +3041,6 @@ static void ext4_orphan_cleanup(struct super_block *sb,
sb->s_flags &= ~SB_RDONLY;
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= SB_ACTIVE;
-
/*
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
@@ -3691,11 +3706,11 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
elr->lr_super = sb;
elr->lr_first_not_zeroed = start;
- if (test_opt(sb, PREFETCH_BLOCK_BITMAPS))
- elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
- else {
+ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
elr->lr_mode = EXT4_LI_MODE_ITABLE;
elr->lr_next_group = start;
+ } else {
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
}
/*
@@ -3726,7 +3741,7 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) &&
+ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
(first_not_zeroed == ngroups || sb_rdonly(sb) ||
!test_opt(sb, INIT_INODE_TABLE)))
goto out;
@@ -4015,7 +4030,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
unsigned long offset = 0;
- unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
const char *descr;
@@ -4026,8 +4040,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int needs_recovery, has_huge_files;
__u64 blocks_count;
int err = 0;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
+ struct ext4_parsed_options parsed_opts;
+
+ /* Set defaults for the variables that will be set during parsing */
+ parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ parsed_opts.journal_devnum = 0;
+ parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
if ((data && !orig_data) || !sbi)
goto out_free_base;
@@ -4273,8 +4292,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
GFP_KERNEL);
if (!s_mount_opts)
goto failed_mount;
- if (!parse_options(s_mount_opts, sb, &journal_devnum,
- &journal_ioprio, 0)) {
+ if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) {
ext4_msg(sb, KERN_WARNING,
"failed to parse options in superblock: %s",
s_mount_opts);
@@ -4282,8 +4300,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
kfree(s_mount_opts);
}
sbi->s_def_mount_opt = sbi->s_mount_opt;
- if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, 0))
+ if (!parse_options((char *) data, sb, &parsed_opts, 0))
goto failed_mount;
#ifdef CONFIG_UNICODE
@@ -4292,12 +4309,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
struct unicode_map *encoding;
__u16 encoding_flags;
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Can't mount with encoding and encryption");
- goto failed_mount;
- }
-
if (ext4_sb_read_encoding(es, &encoding_info,
&encoding_flags)) {
ext4_msg(sb, KERN_ERR,
@@ -4774,7 +4785,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, journal_devnum);
+ err = ext4_load_journal(sb, es, parsed_opts.journal_devnum);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
@@ -4874,7 +4885,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
}
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
sbi->s_journal->j_submit_inode_data_buffers =
ext4_journal_submit_inode_data_buffers;
@@ -4980,6 +4991,19 @@ no_journal:
ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
+
+ /*
+ * Enable optimize_scan if number of groups is >= threshold. This can be
+ * turned off by passing "mb_optimize_scan=0". This can also be
+ * turned on forcefully by passing "mb_optimize_scan=1".
+ */
+ if (parsed_opts.mb_optimize_scan == 1)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else if (parsed_opts.mb_optimize_scan == 0)
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
@@ -4996,7 +5020,7 @@ no_journal:
ext4_journal_commit_callback;
block = ext4_count_free_clusters(sb);
- ext4_free_blocks_count_set(sbi->s_es,
+ ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
GFP_KERNEL);
@@ -5561,8 +5585,10 @@ static int ext4_commit_super(struct super_block *sb)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh || block_device_ejected(sb))
- return error;
+ if (!sbh)
+ return -EINVAL;
+ if (block_device_ejected(sb))
+ return -ENODEV;
ext4_update_super(sb);
@@ -5813,13 +5839,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
struct ext4_mount_options old_opts;
int enable_quota = 0;
ext4_group_t g;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
#ifdef CONFIG_QUOTA
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
+ struct ext4_parsed_options parsed_opts;
+
+ parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ parsed_opts.journal_devnum = 0;
if (data && !orig_data)
return -ENOMEM;
@@ -5850,7 +5879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ parsed_opts.journal_ioprio =
+ sbi->s_journal->j_task->io_context->ioprio;
/*
* Some options can be enabled by ext4 and/or by VFS mount flag
@@ -5860,7 +5890,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
vfs_flags = SB_LAZYTIME | SB_I_VERSION;
sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
+ if (!parse_options(data, sb, &parsed_opts, 1)) {
err = -EINVAL;
goto restore_opts;
}
@@ -5910,7 +5940,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal) {
ext4_init_journal_params(sb, sbi->s_journal);
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
}
/* Flush outstanding errors before changing fs state */
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index a3d08276d441..6f825dedc3d4 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -215,6 +215,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -263,6 +264,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(mb_max_inode_prealloc),
+ ATTR_LIST(mb_max_linear_groups),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -313,6 +315,7 @@ EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
EXT4_ATTR_FEATURE(fast_commit);
+EXT4_ATTR_FEATURE(encrypted_casefold);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -330,6 +333,7 @@ static struct attribute *ext4_feat_attrs[] = {
#endif
ATTR_LIST(metadata_csum_seed),
ATTR_LIST(fast_commit),
+ ATTR_LIST(encrypted_casefold),
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
@@ -528,6 +532,10 @@ int ext4_register_sysfs(struct super_block *sb)
ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
+ proc_create_single_data("mb_stats", 0444, sbi->s_proc,
+ ext4_seq_mb_stats_show, sb);
+ proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
+ &ext4_mb_seq_structs_summary_ops, sb);
}
return 0;
}
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 07438f46b558..eacbd489e3bf 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *addr;
page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, page, offset_in_page(pos), n);
put_page(page);
@@ -80,7 +77,6 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
void *fsdata;
- void *addr;
int res;
res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
@@ -88,9 +84,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
page, fsdata);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6c1018223c54..10ba4b24a0aa 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1617,7 +1617,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
* If storing the value in an external inode is an option,
* reserve space for xattr entries/names in the external
* attribute block so that a long value does not occupy the
- * whole space and prevent futher entries being added.
+ * whole space and prevent further entries being added.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
new_size && is_block &&
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 62e638a49bbf..7669de7b49ce 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -7,6 +7,13 @@ config F2FS_FS
select CRYPTO_CRC32
select F2FS_FS_XATTR if FS_ENCRYPTION
select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
+ select LZ4_COMPRESS if F2FS_FS_LZ4
+ select LZ4_DECOMPRESS if F2FS_FS_LZ4
+ select LZ4HC_COMPRESS if F2FS_FS_LZ4HC
+ select LZO_COMPRESS if F2FS_FS_LZO
+ select LZO_DECOMPRESS if F2FS_FS_LZO
+ select ZSTD_COMPRESS if F2FS_FS_ZSTD
+ select ZSTD_DECOMPRESS if F2FS_FS_ZSTD
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -94,8 +101,6 @@ config F2FS_FS_COMPRESSION
config F2FS_FS_LZO
bool "LZO compression support"
depends on F2FS_FS_COMPRESSION
- select LZO_COMPRESS
- select LZO_DECOMPRESS
default y
help
Support LZO compress algorithm, if unsure, say Y.
@@ -103,8 +108,6 @@ config F2FS_FS_LZO
config F2FS_FS_LZ4
bool "LZ4 compression support"
depends on F2FS_FS_COMPRESSION
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
default y
help
Support LZ4 compress algorithm, if unsure, say Y.
@@ -113,7 +116,6 @@ config F2FS_FS_LZ4HC
bool "LZ4HC compression support"
depends on F2FS_FS_COMPRESSION
depends on F2FS_FS_LZ4
- select LZ4HC_COMPRESS
default y
help
Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
@@ -122,8 +124,6 @@ config F2FS_FS_LZ4HC
config F2FS_FS_ZSTD
bool "ZSTD compression support"
depends on F2FS_FS_COMPRESSION
- select ZSTD_COMPRESS
- select ZSTD_DECOMPRESS
default y
help
Support ZSTD compress algorithm, if unsure, say Y.
@@ -132,8 +132,6 @@ config F2FS_FS_LZORLE
bool "LZO-RLE compression support"
depends on F2FS_FS_COMPRESSION
depends on F2FS_FS_LZO
- select LZO_COMPRESS
- select LZO_DECOMPRESS
default y
help
Support LZO-RLE compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 965037a9c205..239ad9453b99 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count)
static inline int f2fs_acl_count(size_t size)
{
ssize_t s;
+
size -= sizeof(struct f2fs_acl_header);
s = size - 4 * sizeof(struct f2fs_acl_entry_short);
if (s < 0) {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index be5415a0dbbc..f795049e63d5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+
err = recover_orphan_inode(sbi, ino);
if (err) {
f2fs_put_page(page, 1);
@@ -1456,7 +1457,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
orphan_blocks);
if (__remain_node_summaries(cpc->reason))
- ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
else
@@ -1818,7 +1819,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
llist_add(&req.llnode, &cprc->issue_list);
atomic_inc(&cprc->queued_ckpt);
- /* update issue_list before we wake up issue_checkpoint thread */
+ /*
+ * update issue_list before we wake up issue_checkpoint thread,
+ * this smp_mb() pairs with another barrier in ___wait_event(),
+ * see more details in comments of waitqueue_active().
+ */
smp_mb();
if (waitqueue_active(&cprc->ckpt_wait_queue))
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 77fa342de38f..925a5ca3744a 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -76,12 +76,6 @@ bool f2fs_is_compressed_page(struct page *page)
return false;
if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
return false;
- /*
- * page->private may be set with pid.
- * pid_max is enough to check if it is traced.
- */
- if (IS_IO_TRACED_PAGE(page))
- return false;
f2fs_bug_on(F2FS_M_SB(page->mapping),
*((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
@@ -123,19 +117,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len)
f2fs_drop_rpages(cc, len, true);
}
-static void f2fs_put_rpages_mapping(struct address_space *mapping,
- pgoff_t start, int len)
-{
- int i;
-
- for (i = 0; i < len; i++) {
- struct page *page = find_get_page(mapping, start + i);
-
- put_page(page);
- put_page(page);
- }
-}
-
static void f2fs_put_rpages_wbc(struct compress_ctx *cc,
struct writeback_control *wbc, bool redirty, int unlock)
{
@@ -164,13 +145,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
return cc->rpages ? 0 : -ENOMEM;
}
-void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse)
{
page_array_free(cc->inode, cc->rpages, cc->cluster_size);
cc->rpages = NULL;
cc->nr_rpages = 0;
cc->nr_cpages = 0;
- cc->cluster_idx = NULL_CLUSTER;
+ if (!reuse)
+ cc->cluster_idx = NULL_CLUSTER;
}
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
@@ -896,7 +878,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
static bool __cluster_may_compress(struct compress_ctx *cc)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
loff_t i_size = i_size_read(cc->inode);
unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE);
int i;
@@ -904,12 +885,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
for (i = 0; i < cc->cluster_size; i++) {
struct page *page = cc->rpages[i];
- f2fs_bug_on(sbi, !page);
-
- if (unlikely(f2fs_cp_error(sbi)))
- return false;
- if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
- return false;
+ f2fs_bug_on(F2FS_I_SB(cc->inode), !page);
/* beyond EOF */
if (page->index >= nr_pages)
@@ -1048,7 +1024,7 @@ retry:
}
if (PageUptodate(page))
- unlock_page(page);
+ f2fs_put_page(page, 1);
else
f2fs_compress_ctx_add_page(cc, page);
}
@@ -1058,33 +1034,35 @@ retry:
ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
&last_block_in_bio, false, true);
- f2fs_destroy_compress_ctx(cc);
+ f2fs_put_rpages(cc);
+ f2fs_destroy_compress_ctx(cc, true);
if (ret)
- goto release_pages;
+ goto out;
if (bio)
f2fs_submit_bio(sbi, bio, DATA);
ret = f2fs_init_compress_ctx(cc);
if (ret)
- goto release_pages;
+ goto out;
}
for (i = 0; i < cc->cluster_size; i++) {
f2fs_bug_on(sbi, cc->rpages[i]);
page = find_lock_page(mapping, start_idx + i);
- f2fs_bug_on(sbi, !page);
+ if (!page) {
+ /* page can be truncated */
+ goto release_and_retry;
+ }
f2fs_wait_on_page_writeback(page, DATA, true, true);
-
f2fs_compress_ctx_add_page(cc, page);
- f2fs_put_page(page, 0);
if (!PageUptodate(page)) {
+release_and_retry:
+ f2fs_put_rpages(cc);
f2fs_unlock_rpages(cc, i + 1);
- f2fs_put_rpages_mapping(mapping, start_idx,
- cc->cluster_size);
- f2fs_destroy_compress_ctx(cc);
+ f2fs_destroy_compress_ctx(cc, true);
goto retry;
}
}
@@ -1115,10 +1093,10 @@ retry:
}
unlock_pages:
+ f2fs_put_rpages(cc);
f2fs_unlock_rpages(cc, i);
-release_pages:
- f2fs_put_rpages_mapping(mapping, start_idx, i);
- f2fs_destroy_compress_ctx(cc);
+ f2fs_destroy_compress_ctx(cc, true);
+out:
return ret;
}
@@ -1153,7 +1131,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
set_cluster_dirty(&cc);
f2fs_put_rpages_wbc(&cc, NULL, false, 1);
- f2fs_destroy_compress_ctx(&cc);
+ f2fs_destroy_compress_ctx(&cc, false);
return first_index;
}
@@ -1353,6 +1331,7 @@ unlock_continue:
if (fio.compr_blocks)
f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false);
f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true);
+ add_compr_block_stat(inode, cc->nr_cpages);
set_inode_flag(cc->inode, FI_APPEND_WRITE);
if (cc->cluster_idx == 0)
@@ -1372,7 +1351,7 @@ unlock_continue:
f2fs_put_rpages(cc);
page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
- f2fs_destroy_compress_ctx(cc);
+ f2fs_destroy_compress_ctx(cc, false);
return 0;
out_destroy_crypt:
@@ -1383,7 +1362,8 @@ out_destroy_crypt:
for (i = 0; i < cc->nr_cpages; i++) {
if (!cc->cpages[i])
continue;
- f2fs_put_page(cc->cpages[i], 1);
+ f2fs_compress_free_page(cc->cpages[i]);
+ cc->cpages[i] = NULL;
}
out_put_cic:
kmem_cache_free(cic_entry_slab, cic);
@@ -1533,7 +1513,7 @@ write:
err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
f2fs_put_rpages_wbc(cc, wbc, false, 0);
destroy_out:
- f2fs_destroy_compress_ctx(cc);
+ f2fs_destroy_compress_ctx(cc, false);
return err;
}
diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/fs/f2fs/compress.h
+++ /dev/null
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4e5257c763d0..009a09fb9d88 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1086,6 +1086,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
for (; count > 0; dn->ofs_in_node++) {
block_t blkaddr = f2fs_data_blkaddr(dn);
+
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -1722,7 +1723,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
return __get_data_block(inode, iblock, bh_result, create,
F2FS_GET_BLOCK_DIO, NULL,
f2fs_rw_hint_to_seg_type(inode->i_write_hint),
- IS_SWAPFILE(inode) ? false : true);
+ true);
}
static int get_data_block_dio(struct inode *inode, sector_t iblock,
@@ -1837,6 +1838,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int ret = 0;
bool compr_cluster = false;
unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ loff_t maxbytes;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
ret = f2fs_precache_extents(inode);
@@ -1850,6 +1852,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
inode_lock(inode);
+ maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+ if (start > maxbytes) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ if (len > maxbytes || (maxbytes - len) < start)
+ len = maxbytes - start;
+
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
ret = f2fs_xattr_fiemap(inode, fieinfo);
goto out;
@@ -2276,7 +2287,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
max_nr_pages,
&last_block_in_bio,
rac != NULL, false);
- f2fs_destroy_compress_ctx(&cc);
+ f2fs_destroy_compress_ctx(&cc, false);
if (ret)
goto set_error_page;
}
@@ -2321,7 +2332,7 @@ next_page:
max_nr_pages,
&last_block_in_bio,
rac != NULL, false);
- f2fs_destroy_compress_ctx(&cc);
+ f2fs_destroy_compress_ctx(&cc, false);
}
}
#endif
@@ -3022,7 +3033,7 @@ next:
}
}
if (f2fs_compressed_file(inode))
- f2fs_destroy_compress_ctx(&cc);
+ f2fs_destroy_compress_ctx(&cc, false);
#endif
if (retry) {
index = 0;
@@ -3755,6 +3766,7 @@ int f2fs_migrate_page(struct address_space *mapping,
if (atomic_written) {
struct inmem_pages *cur;
+
list_for_each_entry(cur, &fi->inmem_pages, list)
if (cur->page == page) {
cur->page = newpage;
@@ -3780,11 +3792,72 @@ int f2fs_migrate_page(struct address_space *mapping,
#endif
#ifdef CONFIG_SWAP
+static int f2fs_is_file_aligned(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ block_t main_blkaddr = SM_I(sbi)->main_blkaddr;
+ block_t cur_lblock;
+ block_t last_lblock;
+ block_t pblock;
+ unsigned long nr_pblocks;
+ unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int not_aligned = 0;
+ int ret = 0;
+
+ cur_lblock = 0;
+ last_lblock = bytes_to_blks(inode, i_size_read(inode));
+
+ while (cur_lblock < last_lblock) {
+ struct f2fs_map_blocks map;
+
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = cur_lblock;
+ map.m_len = last_lblock - cur_lblock;
+ map.m_next_pgofs = NULL;
+ map.m_next_extent = NULL;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
+
+ ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+ if (ret)
+ goto out;
+
+ /* hole */
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ f2fs_err(sbi, "Swapfile has holes\n");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ pblock = map.m_pblk;
+ nr_pblocks = map.m_len;
+
+ if ((pblock - main_blkaddr) & (blocks_per_sec - 1) ||
+ nr_pblocks & (blocks_per_sec - 1)) {
+ if (f2fs_is_pinned_file(inode)) {
+ f2fs_err(sbi, "Swapfile does not align to section");
+ ret = -EINVAL;
+ goto out;
+ }
+ not_aligned++;
+ }
+
+ cur_lblock += nr_pblocks;
+ }
+ if (not_aligned)
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n"
+ "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()",
+ not_aligned);
+out:
+ return ret;
+}
+
static int check_swap_activate_fast(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
{
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
sector_t cur_lblock;
sector_t last_lblock;
sector_t pblock;
@@ -3792,8 +3865,9 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
sector_t highest_pblock = 0;
int nr_extents = 0;
unsigned long nr_pblocks;
- u64 len;
- int ret;
+ unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int not_aligned = 0;
+ int ret = 0;
/*
* Map all the blocks into the extent list. This code doesn't try
@@ -3801,31 +3875,44 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
*/
cur_lblock = 0;
last_lblock = bytes_to_blks(inode, i_size_read(inode));
- len = i_size_read(inode);
- while (cur_lblock <= last_lblock && cur_lblock < sis->max) {
+ while (cur_lblock < last_lblock && cur_lblock < sis->max) {
struct f2fs_map_blocks map;
- pgoff_t next_pgofs;
cond_resched();
memset(&map, 0, sizeof(map));
map.m_lblk = cur_lblock;
- map.m_len = bytes_to_blks(inode, len) - cur_lblock;
- map.m_next_pgofs = &next_pgofs;
+ map.m_len = last_lblock - cur_lblock;
+ map.m_next_pgofs = NULL;
+ map.m_next_extent = NULL;
map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
if (ret)
- goto err_out;
+ goto out;
/* hole */
- if (!(map.m_flags & F2FS_MAP_FLAGS))
- goto err_out;
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ f2fs_err(sbi, "Swapfile has holes\n");
+ ret = -EINVAL;
+ goto out;
+ }
pblock = map.m_pblk;
nr_pblocks = map.m_len;
+ if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) ||
+ nr_pblocks & (blocks_per_sec - 1)) {
+ if (f2fs_is_pinned_file(inode)) {
+ f2fs_err(sbi, "Swapfile does not align to section");
+ ret = -EINVAL;
+ goto out;
+ }
+ not_aligned++;
+ }
+
if (cur_lblock + nr_pblocks >= sis->max)
nr_pblocks = sis->max - cur_lblock;
@@ -3852,11 +3939,13 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
sis->max = cur_lblock;
sis->pages = cur_lblock - 1;
sis->highest_bit = cur_lblock - 1;
+
+ if (not_aligned)
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n"
+ "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()",
+ not_aligned);
out:
return ret;
-err_out:
- pr_err("swapon: swapfile has holes\n");
- return -EINVAL;
}
/* Copied from generic_swapfile_activate() to check any holes */
@@ -3865,6 +3954,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
{
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned blocks_per_page;
unsigned long page_no;
sector_t probe_block;
@@ -3872,11 +3962,15 @@ static int check_swap_activate(struct swap_info_struct *sis,
sector_t lowest_block = -1;
sector_t highest_block = 0;
int nr_extents = 0;
- int ret;
+ int ret = 0;
if (PAGE_SIZE == F2FS_BLKSIZE)
return check_swap_activate_fast(sis, swap_file, span);
+ ret = f2fs_is_file_aligned(inode);
+ if (ret)
+ goto out;
+
blocks_per_page = bytes_to_blks(inode, PAGE_SIZE);
/*
@@ -3891,13 +3985,14 @@ static int check_swap_activate(struct swap_info_struct *sis,
unsigned block_in_page;
sector_t first_block;
sector_t block = 0;
- int err = 0;
cond_resched();
block = probe_block;
- err = bmap(inode, &block);
- if (err || !block)
+ ret = bmap(inode, &block);
+ if (ret)
+ goto out;
+ if (!block)
goto bad_bmap;
first_block = block;
@@ -3913,9 +4008,10 @@ static int check_swap_activate(struct swap_info_struct *sis,
block_in_page++) {
block = probe_block + block_in_page;
- err = bmap(inode, &block);
-
- if (err || !block)
+ ret = bmap(inode, &block);
+ if (ret)
+ goto out;
+ if (!block)
goto bad_bmap;
if (block != first_block + block_in_page) {
@@ -3955,7 +4051,7 @@ reprobe:
out:
return ret;
bad_bmap:
- pr_err("swapon: swapfile has holes\n");
+ f2fs_err(sbi, "Swapfile has holes\n");
return -EINVAL;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91855d5721cd..c03949a7ccff 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->util_invalid = 50 - si->util_free - si->util_valid;
for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
+
si->curseg[i] = curseg->segno;
si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
@@ -300,10 +301,12 @@ get_cache:
si->page_mem = 0;
if (sbi->node_inode) {
unsigned npages = NODE_MAPPING(sbi)->nrpages;
+
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
if (sbi->meta_inode) {
unsigned npages = META_MAPPING(sbi)->nrpages;
+
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index e6270a867be1..dc7ce79672b8 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -449,9 +449,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct qstr dotdot = QSTR_INIT("..", 2);
-
- return f2fs_find_entry(dir, &dotdot, p);
+ return f2fs_find_entry(dir, &dotdot_name, p);
}
ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
@@ -473,6 +471,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
+
lock_page(page);
f2fs_wait_on_page_writeback(page, type, true, true);
de->ino = cpu_to_le32(inode->i_ino);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 11a20dc505aa..c83d90125ebd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_MOUNT_ATGC 0x08000000
#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
+#define F2FS_MOUNT_GC_MERGE 0x20000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -637,21 +638,26 @@ enum {
#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT)
#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
-#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
-#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
+
#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT)
#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT)
-#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+
#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
+
#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT)
#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT)
#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT)
+
#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT)
#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT)
@@ -860,7 +866,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
+ struct rw_semaphore nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1297,14 +1303,6 @@ enum {
#define IS_DUMMY_WRITTEN_PAGE(page) \
(page_private(page) == DUMMY_WRITTEN_PAGE)
-#ifdef CONFIG_F2FS_IO_TRACE
-#define IS_IO_TRACED_PAGE(page) \
- (page_private(page) > 0 && \
- page_private(page) < (unsigned long)PID_MAX_LIMIT)
-#else
-#define IS_IO_TRACED_PAGE(page) (0)
-#endif
-
/* For compression */
enum compress_algorithm_type {
COMPRESS_LZO,
@@ -1623,6 +1621,11 @@ struct f2fs_sb_info {
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
+
+ /* For runtime compression statistics */
+ u64 compr_written_block;
+ u64 compr_saved_block;
+ u32 compr_new_inode;
#endif
};
@@ -2215,6 +2218,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ void *tmp_ptr = &ckpt->sit_nat_version_bitmap;
int offset;
if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
@@ -2224,7 +2228,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
* if large_nat_bitmap feature is enabled, leave checksum
* protection for all nat/sit bitmaps.
*/
- return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
+ return tmp_ptr + offset + sizeof(__le32);
}
if (__cp_payload(sbi) > 0) {
@@ -2235,7 +2239,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
} else {
offset = (flag == NAT_BITMAP) ?
le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
- return &ckpt->sit_nat_version_bitmap + offset;
+ return tmp_ptr + offset;
}
}
@@ -3302,7 +3306,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname);
/*
* node.c
*/
-struct dnode_of_data;
struct node_info;
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
@@ -3379,6 +3382,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi);
int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno);
void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
@@ -3386,7 +3390,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
unsigned int *newseg, bool new_sec, int dir);
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type);
+void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3550,7 +3554,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
int f2fs_start_gc_thread(struct f2fs_sb_info *sbi);
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force,
unsigned int segno);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
@@ -3952,12 +3956,24 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
void f2fs_put_page_dic(struct page *page);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
-void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi);
void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi);
int __init f2fs_init_compress_cache(void);
void f2fs_destroy_compress_cache(void);
+#define inc_compr_inode_stat(inode) \
+ do { \
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \
+ sbi->compr_new_inode++; \
+ } while (0)
+#define add_compr_block_stat(inode, blocks) \
+ do { \
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \
+ int diff = F2FS_I(inode)->i_cluster_size - blocks; \
+ sbi->compr_written_block += blocks; \
+ sbi->compr_saved_block += diff; \
+ } while (0)
#else
static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -3986,6 +4002,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return
static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
static inline int __init f2fs_init_compress_cache(void) { return 0; }
static inline void f2fs_destroy_compress_cache(void) { }
+#define inc_compr_inode_stat(inode) do { } while (0)
#endif
static inline void set_compress_context(struct inode *inode)
@@ -4009,6 +4026,7 @@ static inline void set_compress_context(struct inode *inode)
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
+ inc_compr_inode_stat(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -4179,8 +4197,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
if (F2FS_IO_ALIGNED(sbi))
return true;
}
- if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) &&
- !IS_SWAPFILE(inode))
+ if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
return true;
return false;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8a56acbcee4c..ceb575f99048 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1622,9 +1622,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
.m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
.m_may_create = true };
- pgoff_t pg_end;
+ pgoff_t pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
+ block_t expanded = 0;
int err;
err = inode_newsize_ok(inode, (len + offset));
@@ -1637,11 +1638,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
f2fs_balance_fs(sbi, true);
+ pg_start = ((unsigned long long)offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT;
off_end = (offset + len) & (PAGE_SIZE - 1);
- map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT;
- map.m_len = pg_end - map.m_lblk;
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
if (off_end)
map.m_len++;
@@ -1649,19 +1651,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
return 0;
if (f2fs_is_pinned_file(inode)) {
- block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
- sbi->log_blocks_per_seg;
- block_t done = 0;
+ block_t sec_blks = BLKS_PER_SEC(sbi);
+ block_t sec_len = roundup(map.m_len, sec_blks);
- if (map.m_len % sbi->blocks_per_seg)
- len += sbi->blocks_per_seg;
-
- map.m_len = sbi->blocks_per_seg;
+ map.m_len = sec_blks;
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+ err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
@@ -1669,7 +1667,7 @@ next_alloc:
down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
- f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED);
+ f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
f2fs_unlock_op(sbi);
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
@@ -1677,24 +1675,25 @@ next_alloc:
up_write(&sbi->pin_sem);
- done += map.m_len;
- len -= map.m_len;
+ expanded += map.m_len;
+ sec_len -= map.m_len;
map.m_lblk += map.m_len;
- if (!err && len)
+ if (!err && sec_len)
goto next_alloc;
- map.m_len = done;
+ map.m_len = expanded;
} else {
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ expanded = map.m_len;
}
out_err:
if (err) {
pgoff_t last_off;
- if (!map.m_len)
+ if (!expanded)
return err;
- last_off = map.m_lblk + map.m_len - 1;
+ last_off = pg_start + expanded - 1;
/* update new size to the failed position */
new_size = (last_off == pg_end) ? offset + len :
@@ -1818,7 +1817,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
struct f2fs_inode_info *fi = F2FS_I(inode);
u32 masked_flags = fi->i_flags & mask;
- f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask));
+ /* mask can be shrunk by flags_valid selector */
+ iflags &= mask;
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode))
@@ -2434,7 +2434,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
+ ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
out:
mnt_drop_write_file(filp);
return ret;
@@ -2470,7 +2470,8 @@ do_more:
down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start));
+ ret = f2fs_gc(sbi, range->sync, true, false,
+ GET_SEGNO(sbi, range->start));
if (ret) {
if (ret == -EBUSY)
ret = -EAGAIN;
@@ -2527,7 +2528,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
{
struct inode *inode = file_inode(filp);
struct f2fs_map_blocks map = { .m_next_extent = NULL,
- .m_seg_type = NO_CHECK_TYPE ,
+ .m_seg_type = NO_CHECK_TYPE,
.m_may_create = false };
struct extent_info ei = {0, 0, 0};
pgoff_t pg_start, pg_end, next_pgofs;
@@ -2923,7 +2924,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
sm->last_victim[GC_CB] = end_segno + 1;
sm->last_victim[GC_GREEDY] = end_segno + 1;
sm->last_victim[ALLOC_NEXT] = end_segno + 1;
- ret = f2fs_gc(sbi, true, true, start_segno);
+ ret = f2fs_gc(sbi, true, true, true, start_segno);
if (ret == -EAGAIN)
ret = 0;
else if (ret < 0)
@@ -4311,8 +4312,13 @@ write:
clear_inode_flag(inode, FI_NO_PREALLOC);
/* if we couldn't write data, we should deallocate blocks. */
- if (preallocated && i_size_read(inode) < target_size)
+ if (preallocated && i_size_read(inode) < target_size) {
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
f2fs_truncate(inode);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ }
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 39330ad3c44e..8d1f17ab94d8 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -31,19 +31,24 @@ static int gc_thread_func(void *data)
struct f2fs_sb_info *sbi = data;
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+ wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
unsigned int wait_ms;
wait_ms = gc_th->min_sleep_time;
set_freezable();
do {
- bool sync_mode;
+ bool sync_mode, foreground = false;
wait_event_interruptible_timeout(*wq,
kthread_should_stop() || freezing(current) ||
+ waitqueue_active(fggc_wq) ||
gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
+ if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
+ foreground = true;
+
/* give it a try one time */
if (gc_th->gc_wake)
gc_th->gc_wake = 0;
@@ -90,7 +95,10 @@ static int gc_thread_func(void *data)
goto do_gc;
}
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (foreground) {
+ down_write(&sbi->gc_lock);
+ goto do_gc;
+ } else if (!down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
@@ -107,14 +115,22 @@ static int gc_thread_func(void *data)
else
increase_sleep_time(gc_th, &wait_ms);
do_gc:
- stat_inc_bggc_count(sbi->stat_info);
+ if (!foreground)
+ stat_inc_bggc_count(sbi->stat_info);
sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+ /* foreground GC has been triggered via f2fs_balance_fs() */
+ if (foreground)
+ sync_mode = false;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
+ if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
+ if (foreground)
+ wake_up_all(&gc_th->fggc_wq);
+
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
@@ -144,10 +160,11 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
- gc_th->gc_wake= 0;
+ gc_th->gc_wake = 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
+ init_waitqueue_head(&sbi->gc_thread->fggc_wq);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
@@ -162,9 +179,11 @@ out:
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
+ wake_up_all(&gc_th->fggc_wq);
kfree(gc_th);
sbi->gc_thread = NULL;
}
@@ -392,10 +411,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
if (p->gc_mode == GC_AT &&
get_valid_blocks(sbi, segno, true) == 0)
return;
-
- if (p->alloc_mode == AT_SSR &&
- get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0)
- return;
}
for (i = 0; i < sbi->segs_per_sec; i++)
@@ -728,11 +743,27 @@ retry:
if (sec_usage_check(sbi, secno))
goto next;
+
/* Don't touch checkpointed data */
- if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
- get_ckpt_valid_blocks(sbi, segno) &&
- p.alloc_mode == LFS))
- goto next;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (p.alloc_mode == LFS) {
+ /*
+ * LFS is set to find source section during GC.
+ * The victim should have no checkpointed data.
+ */
+ if (get_ckpt_valid_blocks(sbi, segno, true))
+ goto next;
+ } else {
+ /*
+ * SSR | AT_SSR are set to find a target segment
+ * for writes, which may already be full of checkpointed
+ * and newly written blocks.
+ */
+ if (!f2fs_segment_has_free_slot(sbi, segno))
+ goto next;
+ }
+ }
+
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
@@ -828,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
static void put_gc_inode(struct gc_inode_list *gc_list)
{
struct inode_entry *ie, *next_ie;
+
list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
@@ -952,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
bidx = node_ofs - 1;
} else if (node_ofs <= indirect_blks) {
int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+
bidx = node_ofs - 2 - dec;
} else {
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
+
bidx = node_ofs - 5 - dec;
}
return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
@@ -1120,7 +1154,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
block_t newaddr;
int err = 0;
bool lfs_mode = f2fs_lfs_mode(fio.sbi);
- int type = fio.sbi->am.atgc_enabled ?
+ int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) &&
+ (fio.sbi->gc_mode != GC_URGENT_HIGH) ?
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
/* do not read out */
@@ -1354,7 +1389,8 @@ out:
* the victim data block is ignored.
*/
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
- struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
+ struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
+ bool force_migrate)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
@@ -1383,8 +1419,8 @@ next_step:
* race condition along with SSR block allocation.
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
- get_valid_blocks(sbi, segno, true) ==
- BLKS_PER_SEC(sbi))
+ (!force_migrate && get_valid_blocks(sbi, segno, true) ==
+ BLKS_PER_SEC(sbi)))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1519,7 +1555,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
static int do_garbage_collect(struct f2fs_sb_info *sbi,
unsigned int start_segno,
- struct gc_inode_list *gc_list, int gc_type)
+ struct gc_inode_list *gc_list, int gc_type,
+ bool force_migrate)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
@@ -1606,7 +1643,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
gc_type);
else
submitted += gc_data_segment(sbi, sum->entries, gc_list,
- segno, gc_type);
+ segno, gc_type,
+ force_migrate);
stat_inc_seg_count(sbi, type, gc_type);
migrated++;
@@ -1634,7 +1672,7 @@ skip:
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
- bool background, unsigned int segno)
+ bool background, bool force, unsigned int segno)
{
int gc_type = sync ? FG_GC : BG_GC;
int sec_freed = 0, seg_freed = 0, total_freed = 0;
@@ -1696,7 +1734,7 @@ gc_more:
if (ret)
goto stop;
- seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
if (gc_type == FG_GC &&
seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
sec_freed++;
@@ -1835,7 +1873,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- do_garbage_collect(sbi, segno, &gc_list, FG_GC);
+ do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
put_gc_inode(&gc_list);
if (!gc_only && get_valid_blocks(sbi, segno, true)) {
@@ -1974,7 +2012,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
/* stop CP to protect MAIN_SEC in free_segment_range */
f2fs_lock_op(sbi);
+
+ spin_lock(&sbi->stat_lock);
+ if (shrunk_blocks + valid_user_blocks(sbi) +
+ sbi->current_reserved_blocks + sbi->unusable_block_count +
+ F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
+ err = -ENOSPC;
+ spin_unlock(&sbi->stat_lock);
+
+ if (err)
+ goto out_unlock;
+
err = free_segment_range(sbi, secs, true);
+
+out_unlock:
f2fs_unlock_op(sbi);
up_write(&sbi->gc_lock);
if (err)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 0c8dae12dc51..3fe145e8e594 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -42,6 +42,12 @@ struct f2fs_gc_kthread {
/* for changing gc mode */
unsigned int gc_wake;
+
+ /* for GC_MERGE mount option */
+ wait_queue_head_t fggc_wq; /*
+ * caller of f2fs_balance_fs()
+ * will wait on this wait queue.
+ */
};
struct gc_inode_list {
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 993caefcd2bb..92652ca7a7c8 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -219,7 +219,8 @@ out:
f2fs_put_page(page, 1);
- f2fs_balance_fs(sbi, dn.node_changed);
+ if (!err)
+ f2fs_balance_fs(sbi, dn.node_changed);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 349d9cb933ee..b401f08569f7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -666,6 +666,7 @@ retry:
node_page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
int err = PTR_ERR(node_page);
+
if (err == -ENOMEM) {
cond_resched();
goto retry;
@@ -698,7 +699,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
/*
* We need to balance fs here to prevent from producing dirty node pages
- * during the urgent cleaning time when runing out of free sections.
+ * during the urgent cleaning time when running out of free sections.
*/
f2fs_update_inode_page(inode);
if (wbc && wbc->nr_to_write)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 14bf4f65bcb3..a9cd9cf97229 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -416,9 +416,9 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
- struct qstr dotdot = QSTR_INIT("..", 2);
struct page *page;
- unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page);
+
if (!ino) {
if (IS_ERR(page))
return ERR_CAST(page);
@@ -628,6 +628,7 @@ static const char *f2fs_get_link(struct dentry *dentry,
struct delayed_call *done)
{
const char *link = page_get_link(dentry, inode, done);
+
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
do_delayed_call(done);
@@ -766,6 +767,7 @@ out_fail:
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+
if (f2fs_empty_dir(inode))
return f2fs_unlink(dir, dentry);
return -ENOTEMPTY;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4b0e2e3c2c88..e67ce5f13b98 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct sysinfo val;
unsigned long avail_ram;
unsigned long mem_size = 0;
bool res = false;
+ if (!nm_i)
+ return true;
+
si_meminfo(&val);
/* only uses low memory */
@@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
/* it allows 20% / total_ram for inmemory pages */
mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
res = mem_size < (val.totalram / 5);
+ } else if (type == DISCARD_CACHE) {
+ mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
+ sizeof(struct discard_cmd)) >> PAGE_SHIFT;
+ res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
} else {
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
return true;
@@ -462,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
/* increment version no as node is removed */
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
+
nat_set_version(e, inc_node_version(version));
}
@@ -1383,7 +1392,7 @@ repeat:
goto out_err;
}
page_hit:
- if(unlikely(nid != nid_of_node(page))) {
+ if (unlikely(nid != nid_of_node(page))) {
f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
@@ -1775,7 +1784,7 @@ continue_unlock:
out:
if (nwritten)
f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
- return ret ? -EIO: 0;
+ return ret ? -EIO : 0;
}
static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
@@ -2117,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi,
struct free_nid *i)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
-
int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+
if (err)
return err;
@@ -2785,6 +2794,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct f2fs_nat_entry raw_ne;
nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
+ if (f2fs_check_nid_range(sbi, nid))
+ continue;
+
raw_ne = nat_in_journal(journal, i);
ne = __lookup_nat_cache(nm_i, nid);
@@ -2980,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
+
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index f84541b57acb..7a45c0f10629 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -147,6 +147,7 @@ enum mem_type {
INO_ENTRIES, /* indicates inode entries */
EXTENT_CACHE, /* indicates extent cache */
INMEM_PAGES, /* indicates inmemory pages */
+ DISCARD_CACHE, /* indicates memory of cached discard cmds */
BASE_CHECK, /* check kernel status */
};
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index da75d5d52f0a..422146c6d866 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
/* Get the previous summary */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
+
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
goto got_it;
@@ -875,5 +876,5 @@ out:
#endif
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
- return ret ? ret: err;
+ return ret ? ret : err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c2866561263e..51dc79fad4fe 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
{
struct inmem_pages *new;
- f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
+ if (PagePrivate(page))
+ set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+ else
+ f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -324,23 +327,27 @@ void f2fs_drop_inmem_pages(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- while (!list_empty(&fi->inmem_pages)) {
+ do {
mutex_lock(&fi->inmem_lock);
+ if (list_empty(&fi->inmem_pages)) {
+ fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
+
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ if (f2fs_is_atomic_file(inode)) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ sbi->atomic_files--;
+ }
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+ mutex_unlock(&fi->inmem_lock);
+ break;
+ }
__revoke_inmem_pages(inode, &fi->inmem_pages,
true, false, true);
mutex_unlock(&fi->inmem_lock);
- }
-
- fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
-
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(inode, FI_ATOMIC_FILE);
- sbi->atomic_files--;
- }
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ } while (1);
}
void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
@@ -503,8 +510,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
* dir/node pages without enough free segments.
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
- down_write(&sbi->gc_lock);
- f2fs_gc(sbi, false, false, NULL_SEGNO);
+ if (test_opt(sbi, GC_MERGE) && sbi->gc_thread &&
+ sbi->gc_thread->f2fs_gc_task) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ wake_up(&sbi->gc_thread->gc_wait_queue_head);
+ io_schedule();
+ finish_wait(&sbi->gc_thread->fggc_wq, &wait);
+ } else {
+ down_write(&sbi->gc_lock);
+ f2fs_gc(sbi, false, false, false, NULL_SEGNO);
+ }
}
}
@@ -653,7 +671,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
llist_add(&cmd.llnode, &fcc->issue_list);
- /* update issue_list before we wake up issue_flush thread */
+ /*
+ * update issue_list before we wake up issue_flush thread, this
+ * smp_mb() pairs with another barrier in ___wait_event(), see
+ * more details in comments of waitqueue_active().
+ */
smp_mb();
if (waitqueue_active(&fcc->flush_wait_queue))
@@ -861,7 +883,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_lock(&dirty_i->seglist_lock);
valid_blocks = get_valid_blocks(sbi, segno, false);
- ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno);
+ ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false);
if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
ckpt_valid_blocks == usable_blocks)) {
@@ -946,7 +968,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
if (get_valid_blocks(sbi, segno, false))
continue;
- if (get_ckpt_valid_blocks(sbi, segno))
+ if (get_ckpt_valid_blocks(sbi, segno, false))
continue;
mutex_unlock(&dirty_i->seglist_lock);
return segno;
@@ -1095,6 +1117,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
struct discard_policy *dpolicy,
int discard_type, unsigned int granularity)
{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
/* common policy */
dpolicy->type = discard_type;
dpolicy->sync = true;
@@ -1114,7 +1138,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->ordered = true;
if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
dpolicy->granularity = 1;
- dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+ if (atomic_read(&dcc->discard_cmd_cnt))
+ dpolicy->max_interval =
+ DEF_MIN_DISCARD_ISSUE_TIME;
}
} else if (discard_type == DPOLICY_FORCE) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1730,8 +1756,15 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
- __init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
- dcc->discard_granularity);
+ if (sbi->gc_mode == GC_URGENT_HIGH ||
+ !f2fs_available_free_memory(sbi, DISCARD_CACHE))
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+ else
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
+ dcc->discard_granularity);
+
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ wait_ms = dpolicy.max_interval;
wait_event_interruptible_timeout(*q,
kthread_should_stop() || freezing(current) ||
@@ -1755,9 +1788,8 @@ static int issue_discard_thread(void *data)
wait_ms = dpolicy.max_interval;
continue;
}
-
- if (sbi->gc_mode == GC_URGENT_HIGH)
- __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ continue;
sb_start_intwrite(sbi->sb);
@@ -1765,7 +1797,7 @@ static int issue_discard_thread(void *data)
if (issued > 0) {
__wait_all_discard_cmd(sbi, &dpolicy);
wait_ms = dpolicy.min_interval;
- } else if (issued == -1){
+ } else if (issued == -1) {
wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
if (!wait_ms)
wait_ms = dpolicy.mid_interval;
@@ -2142,6 +2174,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
unsigned int segno, int modified)
{
struct seg_entry *se = get_seg_entry(sbi, segno);
+
se->type = type;
if (modified)
__mark_sit_entry_dirty(sbi, segno);
@@ -2333,6 +2366,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
void *addr = curseg->sum_blk;
+
addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
}
@@ -2604,22 +2638,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->alloc_type = LFS;
}
-static void __next_free_blkoff(struct f2fs_sb_info *sbi,
- struct curseg_info *seg, block_t start)
+static int __next_free_blkoff(struct f2fs_sb_info *sbi,
+ int segno, block_t start)
{
- struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+ struct seg_entry *se = get_seg_entry(sbi, segno);
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
unsigned long *target_map = SIT_I(sbi)->tmp_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
- int i, pos;
+ int i;
for (i = 0; i < entries; i++)
target_map[i] = ckpt_map[i] | cur_map[i];
- pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
-
- seg->next_blkoff = pos;
+ return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
}
/*
@@ -2631,11 +2663,18 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
struct curseg_info *seg)
{
if (seg->alloc_type == SSR)
- __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+ seg->next_blkoff =
+ __next_free_blkoff(sbi, seg->segno,
+ seg->next_blkoff + 1);
else
seg->next_blkoff++;
}
+bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
+{
+ return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
+}
+
/*
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
@@ -2661,7 +2700,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
reset_curseg(sbi, type, 1);
curseg->alloc_type = SSR;
- __next_free_blkoff(sbi, curseg, 0);
+ curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0);
sum_page = f2fs_get_sum_page(sbi, new_segno);
if (IS_ERR(sum_page)) {
@@ -2893,7 +2932,8 @@ unlock:
up_read(&SM_I(sbi)->curseg_lock);
}
-static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
+static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
+ bool new_sec, bool force)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
@@ -2901,32 +2941,43 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
if (!curseg->inited)
goto alloc;
- if (!curseg->next_blkoff &&
- !get_valid_blocks(sbi, curseg->segno, false) &&
- !get_ckpt_valid_blocks(sbi, curseg->segno))
- return;
+ if (force || curseg->next_blkoff ||
+ get_valid_blocks(sbi, curseg->segno, new_sec))
+ goto alloc;
+ if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
+ return;
alloc:
old_segno = curseg->segno;
SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
locate_dirty_segment(sbi, old_segno);
}
-void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type)
+static void __allocate_new_section(struct f2fs_sb_info *sbi,
+ int type, bool force)
+{
+ __allocate_new_segment(sbi, type, true, force);
+}
+
+void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
+ down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
- __allocate_new_segment(sbi, type);
+ __allocate_new_section(sbi, type, force);
up_write(&SIT_I(sbi)->sentry_lock);
+ up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
+ down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segment(sbi, i);
+ __allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
+ up_read(&SM_I(sbi)->curseg_lock);
}
static const struct segment_allocation default_salloc_ops = {
@@ -3239,7 +3290,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
struct inode *inode = fio->page->mapping->host;
if (is_cold_data(fio->page)) {
- if (fio->sbi->am.atgc_enabled)
+ if (fio->sbi->am.atgc_enabled &&
+ (fio->io_type == FS_DATA_IO) &&
+ (fio->sbi->gc_mode != GC_URGENT_HIGH))
return CURSEG_ALL_DATA_ATGC;
else
return CURSEG_COLD_DATA;
@@ -3365,12 +3418,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
f2fs_inode_chksum_set(sbi, page);
}
- if (F2FS_IO_ALIGNED(sbi))
- fio->retry = false;
-
if (fio) {
struct f2fs_bio_info *io;
+ if (F2FS_IO_ALIGNED(sbi))
+ fio->retry = false;
+
INIT_LIST_HEAD(&fio->list);
fio->in_list = true;
io = sbi->write_io[fio->type] + fio->temp;
@@ -3499,7 +3552,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
__func__, segno);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto drop_bio;
+ }
+
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) {
+ err = -EIO;
+ goto drop_bio;
}
stat_inc_inplace_blocks(fio->sbi);
@@ -3514,6 +3573,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
}
return err;
+drop_bio:
+ if (fio->bio && *(fio->bio)) {
+ struct bio *bio = *(fio->bio);
+
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ *(fio->bio) = NULL;
+ }
+ return err;
}
static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
@@ -3539,6 +3607,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct seg_entry *se;
int type;
unsigned short old_blkoff;
+ unsigned char old_alloc_type;
segno = GET_SEGNO(sbi, new_blkaddr);
se = get_seg_entry(sbi, segno);
@@ -3572,6 +3641,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
old_cursegno = curseg->segno;
old_blkoff = curseg->next_blkoff;
+ old_alloc_type = curseg->alloc_type;
/* change the current segment */
if (segno != curseg->segno) {
@@ -3606,6 +3676,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
change_curseg(sbi, type, true);
}
curseg->next_blkoff = old_blkoff;
+ curseg->alloc_type = old_alloc_type;
}
up_write(&sit_i->sentry_lock);
@@ -3717,6 +3788,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
for (j = 0; j < blk_off; j++) {
struct f2fs_summary *s;
+
s = (struct f2fs_summary *)(kaddr + offset);
seg_i->sum_blk->entries[j] = *s;
offset += SUMMARY_SIZE;
@@ -3779,6 +3851,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
if (__exist_node_summaries(sbi)) {
struct f2fs_summary *ns = &sum->entries[0];
int i;
+
for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
ns->version = 0;
ns->ofs_in_node = 0;
@@ -3880,6 +3953,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 3: write summary entries */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
unsigned short blkoff;
+
seg_i = CURSEG_I(sbi, i);
if (sbi->ckpt->alloc_type[i] == SSR)
blkoff = sbi->blocks_per_seg;
@@ -3916,6 +3990,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
int i, end;
+
if (IS_DATASEG(type))
end = type + NR_CURSEG_DATA_TYPE;
else
@@ -4499,6 +4574,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
/* set use the current segments */
for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
struct curseg_info *curseg_t = CURSEG_I(sbi, type);
+
__set_test_and_inuse(sbi, curseg_t->segno);
}
}
@@ -4731,7 +4807,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
}
static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
- void *data) {
+ void *data)
+{
memcpy(data, zone, sizeof(struct blk_zone));
return 0;
}
@@ -4783,7 +4860,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
f2fs_notice(sbi, "Assign new section to curseg[%d]: "
"curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
- allocate_segment_by_default(sbi, type, true);
+
+ f2fs_allocate_new_section(sbi, type, true);
/* check consistency of the zone curseg pointed to */
if (check_zone_write_pointer(sbi, zbd, &zone))
@@ -4847,8 +4925,10 @@ struct check_zone_write_pointer_args {
};
static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
- void *data) {
+ void *data)
+{
struct check_zone_write_pointer_args *args;
+
args = (struct check_zone_write_pointer_args *)data;
return check_zone_write_pointer(args->sbi, args->fdev, zone);
@@ -5127,6 +5207,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
kvfree(dirty_i->victim_secmap);
}
@@ -5171,6 +5252,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
static void destroy_free_segmap(struct f2fs_sb_info *sbi)
{
struct free_segmap_info *free_i = SM_I(sbi)->free_info;
+
if (!free_i)
return;
SM_I(sbi)->free_info = NULL;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e9a7a637d688..050230c70a53 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -172,12 +172,10 @@ enum {
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
- * FORCE_FG_GC means on-demand cleaning job in background.
*/
enum {
BG_GC = 0,
FG_GC,
- FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
@@ -361,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
}
static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
- unsigned int segno)
+ unsigned int segno, bool use_section)
{
+ if (use_section && __is_large_section(sbi)) {
+ unsigned int start_segno = START_SEGNO(segno);
+ unsigned int blocks = 0;
+ int i;
+
+ for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
+ struct seg_entry *se = get_seg_entry(sbi, start_segno);
+
+ blocks += se->ckpt_valid_blocks;
+ }
+ return blocks;
+ }
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 82592b19b4e0..7d325bfaf65a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -151,6 +151,8 @@ enum {
Opt_compress_chksum,
Opt_compress_mode,
Opt_atgc,
+ Opt_gc_merge,
+ Opt_nogc_merge,
Opt_err,
};
@@ -223,6 +225,8 @@ static match_table_t f2fs_tokens = {
{Opt_compress_chksum, "compress_chksum"},
{Opt_compress_mode, "compress_mode=%s"},
{Opt_atgc, "atgc"},
+ {Opt_gc_merge, "gc_merge"},
+ {Opt_nogc_merge, "nogc_merge"},
{Opt_err, NULL},
};
@@ -555,6 +559,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
/*
@@ -1073,6 +1078,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_atgc:
set_opt(sbi, ATGC);
break;
+ case Opt_gc_merge:
+ set_opt(sbi, GC_MERGE);
+ break;
+ case Opt_nogc_merge:
+ clear_opt(sbi, GC_MERGE);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1616,6 +1627,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
#endif
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
static inline void f2fs_show_compress_options(struct seq_file *seq,
struct super_block *sb)
{
@@ -1661,6 +1673,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER)
seq_printf(seq, ",compress_mode=%s", "user");
}
+#endif
static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
@@ -1673,6 +1686,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
seq_printf(seq, ",background_gc=%s", "off");
+ if (test_opt(sbi, GC_MERGE))
+ seq_puts(seq, ",gc_merge");
+
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, NORECOVERY))
@@ -1824,6 +1840,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, EXTENT_CACHE);
set_opt(sbi, NOHEAP);
clear_opt(sbi, DISABLE_CHECKPOINT);
+ set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
@@ -1865,7 +1882,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+ err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
break;
@@ -1876,7 +1893,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
ret = sync_filesystem(sbi->sb);
if (ret || err) {
- err = ret ? ret: err;
+ err = ret ? ret : err;
goto restore_flag;
}
@@ -1925,8 +1942,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
struct f2fs_mount_info org_mount_opt;
unsigned long old_sb_flags;
int err;
- bool need_restart_gc = false;
- bool need_stop_gc = false;
+ bool need_restart_gc = false, need_stop_gc = false;
+ bool need_restart_ckpt = false, need_stop_ckpt = false;
+ bool need_restart_flush = false, need_stop_flush = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
@@ -2035,7 +2053,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* option. Also sync the filesystem.
*/
if ((*flags & SB_RDONLY) ||
- F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
+ (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF &&
+ !test_opt(sbi, GC_MERGE))) {
if (sbi->gc_thread) {
f2fs_stop_gc_thread(sbi);
need_restart_gc = true;
@@ -2057,18 +2076,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_IS_CLOSE);
}
- if (checkpoint_changed) {
- if (test_opt(sbi, DISABLE_CHECKPOINT)) {
- err = f2fs_disable_checkpoint(sbi);
- if (err)
- goto restore_gc;
- } else {
- f2fs_enable_checkpoint(sbi);
- }
- }
-
- if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
- test_opt(sbi, MERGE_CHECKPOINT)) {
+ if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ !test_opt(sbi, MERGE_CHECKPOINT)) {
+ f2fs_stop_ckpt_thread(sbi);
+ need_restart_ckpt = true;
+ } else {
err = f2fs_start_ckpt_thread(sbi);
if (err) {
f2fs_err(sbi,
@@ -2076,8 +2088,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err);
goto restore_gc;
}
- } else {
- f2fs_stop_ckpt_thread(sbi);
+ need_stop_ckpt = true;
}
/*
@@ -2087,11 +2098,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
+ need_restart_flush = true;
} else {
err = f2fs_create_flush_cmd_control(sbi);
if (err)
- goto restore_gc;
+ goto restore_ckpt;
+ need_stop_flush = true;
}
+
+ if (checkpoint_changed) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+ err = f2fs_disable_checkpoint(sbi);
+ if (err)
+ goto restore_flush;
+ } else {
+ f2fs_enable_checkpoint(sbi);
+ }
+ }
+
skip:
#ifdef CONFIG_QUOTA
/* Release old quota file names */
@@ -2106,6 +2130,21 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
+restore_flush:
+ if (need_restart_flush) {
+ if (f2fs_create_flush_cmd_control(sbi))
+ f2fs_warn(sbi, "background flush thread has stopped");
+ } else if (need_stop_flush) {
+ clear_opt(sbi, FLUSH_MERGE);
+ f2fs_destroy_flush_cmd_control(sbi, false);
+ }
+restore_ckpt:
+ if (need_restart_ckpt) {
+ if (f2fs_start_ckpt_thread(sbi))
+ f2fs_warn(sbi, "background ckpt thread has stopped");
+ } else if (need_stop_ckpt) {
+ f2fs_stop_ckpt_thread(sbi);
+ }
restore_gc:
if (need_restart_gc) {
if (f2fs_start_gc_thread(sbi))
@@ -3719,7 +3758,7 @@ try_onemore:
sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
for (i = 0; i < NR_PAGE_TYPE; i++) {
- int n = (i == META) ? 1: NR_TEMP_TYPE;
+ int n = (i == META) ? 1 : NR_TEMP_TYPE;
int j;
sbi->write_io[i] =
@@ -3833,7 +3872,7 @@ try_onemore:
/* setup checkpoint request control and start checkpoint issue thread */
f2fs_init_ckpt_req_control(sbi);
- if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+ if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) &&
test_opt(sbi, MERGE_CHECKPOINT)) {
err = f2fs_start_ckpt_thread(sbi);
if (err) {
@@ -3929,10 +3968,18 @@ try_onemore:
* previous checkpoint was not done by clean system shutdown.
*/
if (f2fs_hw_is_readonly(sbi)) {
- if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))
- f2fs_err(sbi, "Need to recover fsync data, but write access unavailable");
- else
- f2fs_info(sbi, "write access unavailable, skipping recovery");
+ if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ err = f2fs_recover_fsync_data(sbi, true);
+ if (err > 0) {
+ err = -EROFS;
+ f2fs_err(sbi, "Need to recover fsync data, but "
+ "write access unavailable, please try "
+ "mount w/ disable_roll_forward or norecovery");
+ }
+ if (err < 0)
+ goto free_meta;
+ }
+ f2fs_info(sbi, "write access unavailable, skipping recovery");
goto reset_checkpoint;
}
@@ -3989,7 +4036,8 @@ reset_checkpoint:
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
+ if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF ||
+ test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index e38a7f6921dd..39b522ec73e7 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/unicode.h>
#include <linux/ioprio.h>
+#include <linux/sysfs.h>
#include "f2fs.h"
#include "segment.h"
@@ -91,6 +92,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
(unsigned long long)(free_segments(sbi)));
}
+static ssize_t ovp_segments_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(overprovision_segments(sbi)));
+}
+
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -282,6 +290,17 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return len;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!strcmp(a->attr.name, "compr_written_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->compr_written_block);
+
+ if (!strcmp(a->attr.name, "compr_saved_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block);
+
+ if (!strcmp(a->attr.name, "compr_new_inode"))
+ return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
+#endif
+
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -458,6 +477,24 @@ out:
return count;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!strcmp(a->attr.name, "compr_written_block") ||
+ !strcmp(a->attr.name, "compr_saved_block")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->compr_written_block = 0;
+ sbi->compr_saved_block = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "compr_new_inode")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->compr_new_inode = 0;
+ return count;
+ }
+#endif
+
*ui = (unsigned int)t;
return count;
@@ -629,6 +666,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
+F2FS_GENERAL_RO_ATTR(ovp_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
@@ -668,6 +706,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode);
#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -715,6 +756,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
+ ATTR_LIST(ovp_segments),
ATTR_LIST(unusable),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(features),
@@ -731,6 +773,11 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(moved_blocks_background),
ATTR_LIST(avg_vblocks),
#endif
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ ATTR_LIST(compr_written_block),
+ ATTR_LIST(compr_saved_block),
+ ATTR_LIST(compr_new_inode),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index a7beff28a3c5..03549b5ba204 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
size_t desc_size, u64 merkle_tree_size)
{
struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
struct fsverity_descriptor_location dloc = {
.version = cpu_to_le32(F2FS_VERIFY_VER),
.size = cpu_to_le32(desc_size),
.pos = cpu_to_le64(desc_pos),
};
- int err = 0;
+ int err = 0, err2 = 0;
- if (desc != NULL) {
- /* Succeeded; write the verity descriptor. */
- err = pagecache_write(inode, desc, desc_size, desc_pos);
+ /*
+ * If an error already occurred (which fs/verity/ signals by passing
+ * desc == NULL), then only clean-up is needed.
+ */
+ if (desc == NULL)
+ goto cleanup;
- /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
- if (!err)
- err = filemap_write_and_wait(inode->i_mapping);
- }
+ /* Append the verity descriptor. */
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ goto cleanup;
+
+ /*
+ * Write all pages (both data and verity metadata). Note that this must
+ * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond
+ * i_size won't be written properly. For crash consistency, this also
+ * must happen before the verity inode flag gets persisted.
+ */
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto cleanup;
+
+ /* Set the verity xattr. */
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+ F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+ NULL, XATTR_CREATE);
+ if (err)
+ goto cleanup;
- /* If we failed, truncate anything we wrote past i_size. */
- if (desc == NULL || err)
- f2fs_truncate(inode);
+ /* Finally, set the verity inode flag. */
+ file_set_verity(inode);
+ f2fs_set_inode_flags(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+ return 0;
- if (desc != NULL && !err) {
- err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
- F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
- NULL, XATTR_CREATE);
- if (!err) {
- file_set_verity(inode);
- f2fs_set_inode_flags(inode);
- f2fs_mark_inode_dirty_sync(inode, true);
- }
+cleanup:
+ /*
+ * Verity failed to be enabled, so clean up by truncating any verity
+ * metadata that was written beyond i_size (both from cache and from
+ * disk) and clearing FI_VERITY_IN_PROGRESS.
+ *
+ * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection
+ * from re-instantiating cached pages we are truncating (since unlike
+ * normal file accesses, garbage collection isn't limited by i_size).
+ */
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ err2 = f2fs_truncate(inode);
+ if (err2) {
+ f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)",
+ err2);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- return err;
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+ return err ?: err2;
}
static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 490f843ec3bf..c8f34decbf8e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -488,6 +488,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
f2fs_wait_on_page_writeback(xpage, NODE, true, true);
} else {
struct dnode_of_data dn;
+
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
if (IS_ERR(xpage)) {
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f7e3304b7802..860e884e56e8 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -771,7 +771,7 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
/*
* FAT data is organized as clusters, trim at the granulary of cluster.
*
- * fstrim_range is in byte, convert vaules to cluster index.
+ * fstrim_range is in byte, convert values to cluster index.
* Treat sectors before data region as all used, not to trim them.
*/
ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
diff --git a/fs/file.c b/fs/file.c
index f633348029a5..86dc9956af32 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1081,8 +1081,6 @@ out_unlock:
/**
* __receive_fd() - Install received file into file descriptor table
- *
- * @fd: fd to install into (if negative, a new fd will be allocated)
* @file: struct file that was received from another process
* @ufd: __user pointer to write new fd number to
* @o_flags: the O_* flags to apply to the new fd entry
@@ -1096,7 +1094,7 @@ out_unlock:
*
* Returns newly install fd or -ve on error.
*/
-int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
int new_fd;
int error;
@@ -1105,32 +1103,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag
if (error)
return error;
- if (fd < 0) {
- new_fd = get_unused_fd_flags(o_flags);
- if (new_fd < 0)
- return new_fd;
- } else {
- new_fd = fd;
- }
+ new_fd = get_unused_fd_flags(o_flags);
+ if (new_fd < 0)
+ return new_fd;
if (ufd) {
error = put_user(new_fd, ufd);
if (error) {
- if (fd < 0)
- put_unused_fd(new_fd);
+ put_unused_fd(new_fd);
return error;
}
}
- if (fd < 0) {
- fd_install(new_fd, get_file(file));
- } else {
- error = replace_fd(new_fd, file, o_flags);
- if (error)
- return error;
- }
+ fd_install(new_fd, get_file(file));
+ __receive_sock(file);
+ return new_fd;
+}
- /* Bump the sock usage counts, if any. */
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
+{
+ int error;
+
+ error = security_file_receive(file);
+ if (error)
+ return error;
+ error = replace_fd(new_fd, file, o_flags);
+ if (error)
+ return error;
__receive_sock(file);
return new_fd;
}
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 68b0148f4bb8..980d44fd3a36 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -310,7 +310,6 @@ EXPORT_SYMBOL(fs_param_is_path);
#ifdef CONFIG_VALIDATE_FS_PARSER
/**
* validate_constant_table - Validate a constant table
- * @name: Name to use in reporting
* @tbl: The constant table to validate.
* @tbl_size: The size of the table.
* @low: The lowest permissible value.
@@ -360,6 +359,7 @@ bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
/**
* fs_validate_description - Validate a parameter description
+ * @name: The parameter name to search for.
* @desc: The parameter description to validate.
*/
bool fs_validate_description(const char *name,
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index e9c0f916349d..52b165319be1 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -71,6 +71,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
return -EINVAL;
if (acl) {
+ unsigned int extra_flags = 0;
/*
* Fuse userspace is responsible for updating access
* permissions in the inode, if needed. fuse_setxattr
@@ -94,7 +95,11 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
return ret;
}
- ret = fuse_setxattr(inode, name, value, size, 0);
+ if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
+
+ ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
kfree(value);
} else {
ret = fuse_removexattr(inode, name);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 45082269e698..c7d882a9fe33 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -511,20 +511,18 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
&fuse_dev_fiq_ops, NULL);
+ cc->fc.release = cuse_fc_release;
fud = fuse_dev_alloc_install(&cc->fc);
- if (!fud) {
- kfree(cc);
+ fuse_conn_put(&cc->fc);
+ if (!fud)
return -ENOMEM;
- }
INIT_LIST_HEAD(&cc->list);
- cc->fc.release = cuse_fc_release;
cc->fc.initialized = 1;
rc = cuse_send_init(cc);
if (rc) {
fuse_dev_free(fud);
- fuse_conn_put(&cc->fc);
return rc;
}
file->private_data = fud;
@@ -561,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
unregister_chrdev_region(cc->cdev->dev, 1);
cdev_del(cc->cdev);
}
- /* Base reference is now owned by "fud" */
- fuse_conn_put(&cc->fc);
rc = fuse_dev_release(inode, file); /* puts the base reference */
@@ -627,6 +623,8 @@ static int __init cuse_init(void)
cuse_channel_fops.owner = THIS_MODULE;
cuse_channel_fops.open = cuse_channel_open;
cuse_channel_fops.release = cuse_channel_release;
+ /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
+ cuse_channel_fops.unlocked_ioctl = NULL;
cuse_class = class_create(THIS_MODULE, "cuse");
if (IS_ERR(cuse_class))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c0fee830a34e..a5ceccc5ef00 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2233,11 +2233,8 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
int oldfd;
struct fuse_dev *fud = NULL;
- if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
- return -ENOTTY;
-
- switch (_IOC_NR(cmd)) {
- case _IOC_NR(FUSE_DEV_IOC_CLONE):
+ switch (cmd) {
+ case FUSE_DEV_IOC_CLONE:
res = -EFAULT;
if (!get_user(oldfd, (__u32 __user *)arg)) {
struct file *old = fget(oldfd);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e8aa5337eb29..09ef2a4d25ed 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -802,21 +802,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
{
struct fuse_conn *fc = get_fuse_conn(inode);
- if (fc->writeback_cache) {
- /*
- * A hole in a file. Some data after the hole are in page cache,
- * but have not reached the client fs yet. So, the hole is not
- * present there.
- */
- int i;
- int start_idx = num_read >> PAGE_SHIFT;
- size_t off = num_read & (PAGE_SIZE - 1);
-
- for (i = start_idx; i < ap->num_pages; i++) {
- zero_user_segment(ap->pages[i], off, PAGE_SIZE);
- off = 0;
- }
- } else {
+ /*
+ * If writeback_cache is enabled, a short read means there's a hole in
+ * the file. Some data after the hole is in page cache, but has not
+ * reached the client fs yet. So the hole is not present there.
+ */
+ if (!fc->writeback_cache) {
loff_t pos = page_offset(ap->pages[0]) + num_read;
fuse_read_update_size(inode, pos, attr_ver);
}
@@ -1103,6 +1094,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
+ bool short_write;
int err;
for (i = 0; i < ap->num_pages; i++)
@@ -1117,32 +1109,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
if (!err && ia->write.out.size > count)
err = -EIO;
+ short_write = ia->write.out.size < count;
offset = ap->descs[0].offset;
count = ia->write.out.size;
for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i];
- if (!err && !offset && count >= PAGE_SIZE)
- SetPageUptodate(page);
-
- if (count > PAGE_SIZE - offset)
- count -= PAGE_SIZE - offset;
- else
- count = 0;
- offset = 0;
-
- unlock_page(page);
+ if (err) {
+ ClearPageUptodate(page);
+ } else {
+ if (count >= PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
+ else {
+ if (short_write)
+ ClearPageUptodate(page);
+ count = 0;
+ }
+ offset = 0;
+ }
+ if (ia->write.page_locked && (i == ap->num_pages - 1))
+ unlock_page(page);
put_page(page);
}
return err;
}
-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
+static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
unsigned int max_pages)
{
+ struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
@@ -1195,6 +1193,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
if (offset == PAGE_SIZE)
offset = 0;
+ /* If we copied full page, mark it uptodate */
+ if (tmp == PAGE_SIZE)
+ SetPageUptodate(page);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ ia->write.page_locked = true;
+ break;
+ }
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
@@ -1238,7 +1246,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
break;
}
- count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
+ count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
if (count <= 0) {
err = count;
} else {
@@ -1753,8 +1761,17 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
container_of(args, typeof(*wpa), ia.ap.args);
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
mapping_set_error(inode->i_mapping, error);
+ /*
+ * A writeback finished and this might have updated mtime/ctime on
+ * server making local mtime/ctime stale. Hence invalidate attrs.
+ * Do this only if writeback_cache is not enabled. If writeback_cache
+ * is enabled, we trust local ctime/mtime.
+ */
+ if (!fc->writeback_cache)
+ fuse_invalidate_attr(inode);
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ca868b71eb97..7e463e220053 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -552,9 +552,12 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
- /** Maxmum number of pages that can be used in a single request */
+ /** Maximum number of pages that can be used in a single request */
unsigned int max_pages;
+ /** Constrain ->max_pages to this value during feature negotiation */
+ unsigned int max_pages_limit;
+
/** Input queue */
struct fuse_iqueue iq;
@@ -668,6 +671,9 @@ struct fuse_conn {
/** Is setxattr not implemented by fs? */
unsigned no_setxattr:1;
+ /** Does file server support extended setxattr */
+ unsigned setxattr_ext:1;
+
/** Is getxattr not implemented by fs? */
unsigned no_getxattr:1;
@@ -713,7 +719,7 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
- /** Filesystem is fully reponsible for page cache invalidation. */
+ /** Filesystem is fully responsible for page cache invalidation. */
unsigned explicit_inval_data:1;
/** Does the filesystem support readdirplus? */
@@ -934,6 +940,7 @@ struct fuse_io_args {
struct {
struct fuse_write_in in;
struct fuse_write_out out;
+ bool page_locked;
} write;
};
struct fuse_args_pages ap;
@@ -1193,7 +1200,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked);
bool fuse_lock_inode(struct inode *inode);
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
- size_t size, int flags);
+ size_t size, int flags, unsigned int extra_flags);
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size);
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b4b956da3851..393e36b74dc4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -712,6 +712,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+ fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
@@ -872,14 +873,13 @@ static struct dentry *fuse_get_parent(struct dentry *child)
struct inode *inode;
struct dentry *parent;
struct fuse_entry_out outarg;
- const struct qstr name = QSTR_INIT("..", 2);
int err;
if (!fc->export_support)
return ERR_PTR(-ESTALE);
err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
- &name, &outarg, &inode);
+ &dotdot_name, &outarg, &inode);
if (err) {
if (err == -ENOENT)
return ERR_PTR(-ESTALE);
@@ -1040,7 +1040,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->abort_err = 1;
if (arg->flags & FUSE_MAX_PAGES) {
fc->max_pages =
- min_t(unsigned int, FUSE_MAX_MAX_PAGES,
+ min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
}
if (IS_ENABLED(CONFIG_FUSE_DAX) &&
@@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->handle_killpriv_v2 = 1;
fm->sb->s_flags |= SB_NOSEC;
}
+ if (arg->flags & FUSE_SETXATTR_EXT)
+ fc->setxattr_ext = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
- FUSE_HANDLE_KILLPRIV_V2;
+ FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
ia->in.flags |= FUSE_MAP_ALIGNMENT;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4ee6f734ba83..bcb8a02e2d8b 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -18,6 +18,12 @@
#include <linux/uio.h>
#include "fuse_i.h"
+/* Used to help calculate the FUSE connection's max_pages limit for a request's
+ * size. Parts of the struct fuse_req are sliced into scattergather lists in
+ * addition to the pages used, so this can help account for that overhead.
+ */
+#define FUSE_HEADER_OVERHEAD 4
+
/* List of virtio-fs device instances and a lock for the list. Also provides
* mutual exclusion in device removal and mounting path
*/
@@ -127,11 +133,6 @@ static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
return &fs->vqs[vq->index];
}
-static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
-{
- return &vq_to_fsvq(vq)->fud->pq;
-}
-
/* Should be called with fsvq->lock held. */
static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
{
@@ -896,6 +897,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
vdev->config->reset(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
+ kfree(fs->vqs);
out:
vdev->priv = NULL;
@@ -1413,9 +1415,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
{
struct virtio_fs *fs;
struct super_block *sb;
- struct fuse_conn *fc;
+ struct fuse_conn *fc = NULL;
struct fuse_mount *fm;
- int err;
+ unsigned int virtqueue_size;
+ int err = -EIO;
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
@@ -1427,6 +1430,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
return -EINVAL;
}
+ virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq);
+ if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD))
+ goto out_err;
+
err = -ENOMEM;
fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
if (!fc)
@@ -1436,12 +1443,15 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
if (!fm)
goto out_err;
- fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
- &virtio_fs_fiq_ops, fs);
+ fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs);
fc->release = fuse_free_conn;
fc->delete_stale = true;
fc->auto_submounts = true;
+ /* Tell FUSE to split requests that exceed the virtqueue's size */
+ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
+ virtqueue_size - FUSE_HEADER_OVERHEAD);
+
fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
if (fsc->s_fs_info) {
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 1a7d7ace54e1..61dfaf7b7d20 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -12,7 +12,7 @@
#include <linux/posix_acl_xattr.h>
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
- size_t size, int flags)
+ size_t size, int flags, unsigned int extra_flags)
{
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
@@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
inarg.flags = flags;
+ inarg.setxattr_flags = extra_flags;
+
args.opcode = FUSE_SETXATTR;
args.nodeid = get_node_id(inode);
args.in_numargs = 3;
- args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].size = fm->fc->setxattr_ext ?
+ sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE;
args.in_args[0].value = &inarg;
args.in_args[1].size = strlen(name) + 1;
args.in_args[1].value = name;
@@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler,
if (!value)
return fuse_removexattr(inode, name);
- return fuse_setxattr(inode, name, value, size, flags);
+ return fuse_setxattr(inode, name, value, size, flags, 0);
}
static bool no_xattr_list(struct dentry *dentry)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a0b542d84cd9..493a83e3f590 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,8 +911,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
current->backing_dev_info = inode_to_bdi(inode);
buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
current->backing_dev_info = NULL;
- if (unlikely(buffered <= 0))
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
goto out_unlock;
+ }
/*
* We need to ensure that the page cache pages are written to
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 84c38103aa06..d9cb261f55b0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
if (mapping) {
truncate_inode_pages_final(mapping);
if (!gfs2_withdrawn(sdp))
- GLOCK_BUG_ON(gl, mapping->nrpages ||
- mapping->nrexceptional);
+ GLOCK_BUG_ON(gl, !mapping_empty(mapping));
}
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
@@ -583,6 +582,16 @@ out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
+static bool is_system_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ if (gl == m_ip->i_gl)
+ return true;
+ return false;
+}
+
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
@@ -672,17 +681,25 @@ skip_inval:
* to see sd_log_error and withdraw, and in the meantime, requeue the
* work for later.
*
+ * We make a special exception for some system glocks, such as the
+ * system statfs inode glock, which needs to be granted before the
+ * gfs2_quotad daemon can exit, and that exit needs to finish before
+ * we can unmount the withdrawn file system.
+ *
* However, if we're just unlocking the lock (say, for unmount, when
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
gfs2_withdraw_delayed(sdp);
- if (glock_blocked_by_withdraw(gl)) {
- if (target != LM_ST_UNLOCKED ||
- test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) {
+ if (glock_blocked_by_withdraw(gl) &&
+ (target != LM_ST_UNLOCKED ||
+ test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
+ if (!is_system_glock(gl)) {
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
goto out;
+ } else {
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
@@ -1467,9 +1484,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
glock_blocked_by_withdraw(gl) &&
gh->gh_gl != sdp->sd_jinode_gl) {
sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
might_sleep();
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
}
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1776,6 +1795,7 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
@@ -1821,7 +1841,6 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
- clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
continue;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 454095e9fedf..54d3fbeb3002 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -396,7 +396,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_flags & I_NEW;
+ bool is_new = ip->i_inode.i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 97d54e581a7b..42c15cfc0821 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -926,10 +926,10 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
}
/**
- * ail_drain - drain the ail lists after a withdraw
+ * gfs2_ail_drain - drain the ail lists after a withdraw
* @sdp: Pointer to GFS2 superblock
*/
-static void ail_drain(struct gfs2_sbd *sdp)
+void gfs2_ail_drain(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
@@ -956,6 +956,7 @@ static void ail_drain(struct gfs2_sbd *sdp)
list_del(&tr->tr_list);
gfs2_trans_free(sdp, tr);
}
+ gfs2_drain_revokes(sdp);
spin_unlock(&sdp->sd_ail_lock);
}
@@ -1162,7 +1163,6 @@ out_withdraw:
if (tr && list_empty(&tr->tr_list))
list_add(&tr->tr_list, &sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- ail_drain(sdp); /* frees all transactions */
tr = NULL;
goto out_end;
}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eea58015710e..fc905c2af53c 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -93,5 +93,6 @@ extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 221e7118cc3b..8ee05d25dfa6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -885,7 +885,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_write_page(sdp, page);
}
-static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+void gfs2_drain_revokes(struct gfs2_sbd *sdp)
{
struct list_head *head = &sdp->sd_log_revokes;
struct gfs2_bufdata *bd;
@@ -900,6 +900,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
}
}
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ gfs2_drain_revokes(sdp);
+}
+
static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 31b6dd0d2e5d..f707601597dc 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -20,6 +20,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
+extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 3e08027a6c81..f4325b44956d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -131,6 +131,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc)
return;
+ gfs2_ail_drain(sdp); /* frees all transactions */
inode = sdp->sd_jdesc->jd_inode;
ip = GFS2_I(inode);
i_gl = ip->i_gl;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index a930ddd15681..7054a542689f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode *inode)
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
if (res)
break;
- hfs_brec_remove(&fd);
- mutex_unlock(&fd.tree->tree_lock);
start = hip->cached_start;
+ if (blk_cnt <= start)
+ hfs_brec_remove(&fd);
+ mutex_unlock(&fd.tree->tree_lock);
hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->cached_extents);
+ mutex_lock(&fd.tree->tree_lock);
if (blk_cnt > start) {
hip->extent_state |= HFSPLUS_EXT_DIRTY;
break;
@@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode *inode)
alloc_cnt = start;
hip->cached_start = hip->cached_blocks = 0;
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
- mutex_lock(&fd.tree->tree_lock);
}
hfs_find_exit(&fd);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7b5e984ff02a..7d0c3dbb2898 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -316,7 +316,7 @@ retry:
if (mode & FMODE_WRITE)
r = w = 1;
- name = dentry_name(d_real(file->f_path.dentry, file->f_inode));
+ name = dentry_name(file_dentry(file));
if (name == NULL)
return -ENOMEM;
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 302f45101a96..d92c4af3e1b4 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -356,7 +356,8 @@ struct hpfs_dirent {
u8 no_of_acls; /* number of ACL's (low 3 bits) */
u8 ix; /* code page index (of filename), see
struct code_page_data */
- u8 namelen, name[1]; /* file name */
+ u8 namelen; /* file name length */
+ u8 name[]; /* file name */
/* dnode_secno down; btree down pointer, if present,
follows name on next word boundary, or maybe it
precedes next dirent, which is on a word boundary. */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 701c82c36138..55efd3dd04f6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -131,6 +131,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
@@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
vma->vm_ops = &hugetlb_vm_ops;
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
+
/*
* page based offset in vm_pgoff could be sufficiently large to
* overflow a loff_t when converted to byte offset. This can
@@ -463,14 +468,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
- struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- vma_init(&pseudo_vma, current->mm);
- pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec);
next = start;
while (next < end) {
@@ -482,10 +484,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
- u32 hash;
+ u32 hash = 0;
index = page->index;
- hash = hugetlb_fault_mutex_hash(mapping, index);
if (!truncate_op) {
/*
* Only need to hold the fault mutex in the
@@ -493,6 +494,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* page faults. Races are not possible in the
* case of truncation.
*/
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
}
@@ -527,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* the subpool and global reserve usage count can need
* to be adjusted.
*/
- VM_BUG_ON(PagePrivate(page));
+ VM_BUG_ON(HPageRestoreReserve(page));
remove_huge_page(page);
freed++;
if (!truncate_op) {
@@ -1435,7 +1437,7 @@ static int get_hstate_idx(int page_size_log)
if (!h)
return -1;
- return h - hstates;
+ return hstate_index(h);
}
/*
diff --git a/fs/inode.c b/fs/inode.c
index 9e192bea0630..c93500d84264 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -529,7 +529,14 @@ void clear_inode(struct inode *inode)
*/
xa_lock_irq(&inode->i_data.i_pages);
BUG_ON(inode->i_data.nrpages);
- BUG_ON(inode->i_data.nrexceptional);
+ /*
+ * Almost always, mapping_empty(&inode->i_data) here; but there are
+ * two known and long-standing ways in which nodes may get left behind
+ * (when deep radix-tree node allocation failed partway; or when THP
+ * collapse_file() failed). Until those two known cases are cleaned up,
+ * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
+ * nor even WARN_ON(!mapping_empty).
+ */
xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0129e6bab985..9023717c5188 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -394,7 +394,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
struct inode *inode = rac->mapping->host;
loff_t pos = readahead_pos(rac);
- loff_t length = readahead_length(rac);
+ size_t length = readahead_length(rac);
struct iomap_readpage_ctx ctx = {
.rac = rac,
};
@@ -402,7 +402,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(inode, readahead_count(rac));
while (length > 0) {
- loff_t ret = iomap_apply(inode, pos, length, 0, ops,
+ ssize_t ret = iomap_apply(inode, pos, length, 0, ops,
&ctx, iomap_readahead_actor);
if (ret <= 0) {
WARN_ON_ONCE(ret == 0);
@@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
}
void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
- void (*merge_private)(struct iomap_ioend *ioend,
- struct iomap_ioend *next))
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
struct iomap_ioend *next;
@@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
break;
list_move_tail(&next->io_list, &ioend->io_list);
ioend->io_size += next->io_size;
- if (next->io_private && merge_private)
- merge_private(ioend, next);
}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
@@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
- ioend->io_private = NULL;
ioend->io_bio = bio;
return ioend;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bdd0d89bbf0a..9398b8c31323 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -487,12 +487,28 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, pos, end)) {
+ ret = -EAGAIN;
+ goto out_free_dio;
+ }
+ iomap_flags |= IOMAP_NOWAIT;
+ }
+
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
iomap_flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, pos, end)) {
+ ret = -EAGAIN;
+ goto out_free_dio;
+ }
+ iomap_flags |= IOMAP_NOWAIT;
+ }
+
/* for data sync or sync, we need sync completion processing */
if (iocb->ki_flags & IOCB_DSYNC)
dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -507,14 +523,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_WRITE_FUA;
}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, pos, end)) {
- ret = -EAGAIN;
- goto out_free_dio;
- }
- iomap_flags |= IOMAP_NOWAIT;
- }
-
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (pos >= dio->i_size || pos + count > dio->i_size)
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 94ef92fe806c..4880146babaf 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -767,6 +767,7 @@ repeat:
rs.cont_extent = isonum_733(rr->u.CE.extent);
rs.cont_offset = isonum_733(rr->u.CE.offset);
rs.cont_size = isonum_733(rr->u.CE.size);
+ break;
default:
break;
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 69f18fe20923..d47a0d96bf30 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -245,15 +245,14 @@ static int fc_do_one_pass(journal_t *journal,
return 0;
while (next_fc_block <= journal->j_fc_last) {
- jbd_debug(3, "Fast commit replay: next block %ld",
+ jbd_debug(3, "Fast commit replay: next block %ld\n",
next_fc_block);
err = jread(&bh, journal, next_fc_block);
if (err) {
- jbd_debug(3, "Fast commit replay: read error");
+ jbd_debug(3, "Fast commit replay: read error\n");
break;
}
- jbd_debug(3, "Processing fast commit blk with seq %d");
err = journal->j_fc_replay_callback(journal, bh, pass,
next_fc_block - journal->j_fc_first,
expected_commit_id);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 9396666b7314..e8fc45fd751f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -349,7 +349,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
}
alloc_transaction:
- if (!journal->j_running_transaction) {
+ /*
+ * This check is racy but it is just an optimization of allocating new
+ * transaction early if there are high chances we'll need it. If we
+ * guess wrong, we'll retry or free unused transaction.
+ */
+ if (!data_race(journal->j_running_transaction)) {
/*
* If __GFP_FS is not present, then we may be being called from
* inside the fs writeback layer, so we MUST NOT fail.
@@ -1474,8 +1479,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* crucial to catch bugs so let's do a reliable check until the
* lockless handling is fully proven.
*/
- if (jh->b_transaction != transaction &&
- jh->b_next_transaction != transaction) {
+ if (data_race(jh->b_transaction != transaction &&
+ jh->b_next_transaction != transaction)) {
spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
@@ -1483,8 +1488,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
}
if (jh->b_modified == 1) {
/* If it's in our transaction it must be in BJ_Metadata list. */
- if (jh->b_transaction == transaction &&
- jh->b_jlist != BJ_Metadata) {
+ if (data_race(jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata)) {
spin_lock(&jh->b_state_lock);
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
deleted file mode 100644
index ca28964abd4b..000000000000
--- a/fs/jffs2/TODO
+++ /dev/null
@@ -1,37 +0,0 @@
-
- - support asynchronous operation -- add a per-fs 'reserved_space' count,
- let each outstanding write reserve the _maximum_ amount of physical
- space it could take. Let GC flush the outstanding writes because the
- reservations will necessarily be pessimistic. With this we could even
- do shared writable mmap, if we can have a fs hook for do_wp_page() to
- make the reservation.
- - disable compression in commit_write()?
- - fine-tune the allocation / GC thresholds
- - chattr support - turning on/off and tuning compression per-inode
- - checkpointing (do we need this? scan is quite fast)
- - make the scan code populate real inodes so read_inode just after
- mount doesn't have to read the flash twice for large files.
- Make this a per-inode option, changeable with chattr, so you can
- decide which inodes should be in-core immediately after mount.
- - test, test, test
-
- - NAND flash support:
- - almost done :)
- - use bad block check instead of the hardwired byte check
-
- - Optimisations:
- - Split writes so they go to two separate blocks rather than just c->nextblock.
- By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE
- nodes to a different one, we can separate clean nodes from those which
- are likely to become dirty, and end up with blocks which are each far
- closer to 100% or 0% clean, hence speeding up later GC progress dramatically.
- - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in
- the full dirent, we only need to go to the flash in lookup() when we think we've
- got a match, and in readdir().
- - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately?
- - Remove size from jffs2_raw_node_frag.
-
-dedekind:
-1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate.
-2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in
- case of failure? scan() does not clean everything. Fix.
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f8fb89b10227..4fc8cd698d1a 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -57,6 +57,7 @@ const struct file_operations jffs2_file_operations =
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
};
/* jffs2_file_inode_operations */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index db72a9d2d0af..b676056826be 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1079,7 +1079,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
memcpy(&fd->name, rd->name, checkedlen);
fd->name[checkedlen] = 0;
- crc = crc32(0, fd->name, rd->nsize);
+ crc = crc32(0, fd->name, checkedlen);
if (crc != je32_to_cpu(rd->name_crc)) {
pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
__func__, ofs, je32_to_cpu(rd->name_crc), crc);
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index e4131cb1f1d4..36d9a1280770 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
#define jffs2_sum_active() (0)
#define jffs2_sum_init(a) (0)
-#define jffs2_sum_exit(a)
+#define jffs2_sum_exit(a) do { } while (0)
#define jffs2_sum_disable_collecting(a)
#define jffs2_sum_is_disabled(a) (0)
-#define jffs2_sum_reset_collected(a)
+#define jffs2_sum_reset_collected(a) do { } while (0)
#define jffs2_sum_add_kvec(a,b,c,d) (0)
-#define jffs2_sum_move_collected(a,b)
+#define jffs2_sum_move_collected(a,b) do { } while (0)
#define jffs2_sum_write_sumnode(a) (0)
-#define jffs2_sum_add_padding_mem(a,b)
-#define jffs2_sum_add_inode_mem(a,b,c)
-#define jffs2_sum_add_dirent_mem(a,b,c)
-#define jffs2_sum_add_xattr_mem(a,b,c)
-#define jffs2_sum_add_xref_mem(a,b,c)
+#define jffs2_sum_add_padding_mem(a,b) do { } while (0)
+#define jffs2_sum_add_inode_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_dirent_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_xattr_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_xref_mem(a,b,c) do { } while (0)
#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
#endif /* CONFIG_JFFS2_SUMMARY */
diff --git a/fs/locks.c b/fs/locks.c
index 5c42363aa811..74b2a1dfe8d8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1808,6 +1808,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
if (flags & FL_LAYOUT)
return 0;
+ if (flags & FL_DELEG)
+ /* We leave these checks to the caller */
+ return 0;
if (arg == F_RDLCK)
return inode_is_open_for_write(inode) ? -EAGAIN : 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index f63337828e1c..c3f1a78ba369 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3855,8 +3855,12 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
return -EINVAL;
+ /* Don't yet support filesystem mountable in user namespaces. */
+ if (m->mnt_sb->s_user_ns != &init_user_ns)
+ return -EINVAL;
+
/* We're not controlling the superblock. */
- if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* Mount has already been visible in the filesystem hierarchy. */
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 578112713703..b4db21022cb4 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NETFS_SUPPORT
- tristate "Support for network filesystem high-level I/O"
+ tristate
help
This option enables support for network filesystems, including
helpers for high-level buffered I/O, abstracting out read
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 193841d03de0..725614625ed4 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -1068,7 +1068,7 @@ int netfs_write_begin(struct file *file, struct address_space *mapping,
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
- page = grab_cache_page_write_begin(mapping, index, 0);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f7786e00a6a7..ed9d580826f5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -137,12 +137,12 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
if (!pnfs_layout_is_valid(lo))
continue;
- if (stateid != NULL &&
- !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+ if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
continue;
- if (!nfs_sb_active(server->super))
- continue;
- inode = igrab(lo->plh_inode);
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
rcu_read_unlock();
if (inode)
return inode;
@@ -176,9 +176,10 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
continue;
if (nfsi->layout != lo)
continue;
- if (!nfs_sb_active(server->super))
- continue;
- inode = igrab(lo->plh_inode);
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
rcu_read_unlock();
if (inode)
return inode;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ff5c4d0d6d13..cfeaadf56bf0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -476,7 +476,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_maxval = to->to_initval;
to->to_exponential = 0;
break;
-#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
case XPRT_TRANSPORT_UDP:
if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_UDP_RETRANS;
@@ -487,7 +486,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_maxval = NFS_MAX_UDP_TIMEOUT;
to->to_exponential = 1;
break;
-#endif
default:
BUG();
}
@@ -698,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = ctx->flags;
server->options = ctx->options;
- server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
- NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
- NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+ server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+
+ switch (clp->rpc_ops->version) {
+ case 2:
+ server->fattr_valid = NFS_ATTR_FATTR_V2;
+ break;
+ case 3:
+ server->fattr_valid = NFS_ATTR_FATTR_V3;
+ break;
+ default:
+ server->fattr_valid = NFS_ATTR_FATTR_V4;
+ }
if (ctx->rsize)
server->rsize = nfs_block_size(ctx->rsize, NULL);
@@ -794,6 +801,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->maxfilesize = fsinfo->maxfilesize;
server->time_delta = fsinfo->time_delta;
+ server->change_attr_type = fsinfo->change_attr_type;
server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
@@ -935,6 +943,8 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 04bf8066980c..e6ec6f09ac6e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -114,7 +114,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
return ret;
}
/**
- * nfs_have_delegation - check if inode has a delegation, mark it
+ * nfs4_have_delegation - check if inode has a delegation, mark it
* NFS_DELEGATION_REFERENCED if there is one.
* @inode: inode to check
* @flags: delegation types to check for
@@ -481,6 +481,22 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
if (freeme == NULL)
goto out;
add_new:
+ /*
+ * If we didn't revalidate the change attribute before setting
+ * the delegation, then pre-emptively ask for a full attribute
+ * cache revalidation.
+ */
+ spin_lock(&inode->i_lock);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE)
+ nfs_set_cache_invalid(inode,
+ NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA |
+ NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ spin_unlock(&inode->i_lock);
+
list_add_tail_rcu(&delegation->super_list, &server->delegations);
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
@@ -488,11 +504,6 @@ add_new:
atomic_long_inc(&nfs_active_delegations);
trace_nfs4_set_delegation(inode, type);
-
- spin_lock(&inode->i_lock);
- if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME))
- NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED;
- spin_unlock(&inode->i_lock);
out:
spin_unlock(&clp->cl_lock);
if (delegation != NULL)
@@ -674,7 +685,7 @@ void nfs_inode_evict_delegation(struct inode *inode)
}
/**
- * nfs_inode_return_delegation - synchronously return a delegation
+ * nfs4_inode_return_delegation - synchronously return a delegation
* @inode: inode to process
*
* This routine will always flush any dirty data to disk on the
@@ -697,7 +708,7 @@ int nfs4_inode_return_delegation(struct inode *inode)
}
/**
- * nfs_inode_return_delegation_on_close - asynchronously return a delegation
+ * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
* @inode: inode to process
*
* This routine is called on file close in order to determine if the
@@ -811,7 +822,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
}
/**
- * nfs_super_return_all_delegations - return delegations for one superblock
+ * nfs_server_return_all_delegations - return delegations for one superblock
* @server: pointer to nfs_server to process
*
*/
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9b00a0b7f832..c19b4fd20781 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -84,8 +84,7 @@ int nfs4_inode_make_writeable(struct inode *inode);
static inline int nfs_have_delegated_attributes(struct inode *inode)
{
- return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
- !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
}
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fc4f490f2d78..1a6d2867fba4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -866,6 +866,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
break;
}
+ verf_arg = verf_res;
+
status = nfs_readdir_page_filler(desc, entry, pages, pglen,
arrays, narrays);
} while (!status && nfs_readdir_page_needs_filling(page));
@@ -927,7 +929,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
}
return res;
}
- memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf));
+ /*
+ * Set the cookie verifier if the page cache was empty
+ */
+ if (desc->page_index == 0)
+ memcpy(nfsi->cookieverf, verf,
+ sizeof(nfsi->cookieverf));
}
res = nfs_readdir_search_array(desc);
if (res == 0) {
@@ -974,10 +981,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
-static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
+static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
+ const __be32 *verf)
{
struct file *file = desc->file;
- struct nfs_inode *nfsi = NFS_I(file_inode(file));
struct nfs_cache_array *array;
unsigned int i = 0;
@@ -991,7 +998,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
desc->eof = true;
break;
}
- memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf));
+ memcpy(desc->verf, verf, sizeof(desc->verf));
if (i < (array->size-1))
desc->dir_cookie = array->array[i+1].cookie;
else
@@ -1048,7 +1055,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
desc->page = arrays[i];
- nfs_do_filldir(desc);
+ nfs_do_filldir(desc, verf);
}
desc->page = NULL;
@@ -1069,6 +1076,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
+ struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
int res;
@@ -1122,7 +1130,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
}
if (res == -ETOOSMALL && desc->plus) {
- clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
nfs_zap_caches(inode);
desc->page_index = 0;
desc->plus = false;
@@ -1132,7 +1140,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
if (res < 0)
break;
- nfs_do_filldir(desc);
+ nfs_do_filldir(desc, nfsi->cookieverf);
nfs_readdir_page_unlock_and_put_cached(desc);
} while (!desc->eof);
@@ -1703,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode)
NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
nfs_set_cache_invalid(
inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
- NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED);
+ NFS_INO_INVALID_NLINK);
spin_unlock(&inode->i_lock);
}
@@ -2940,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode))
return 0;
- if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) {
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
ret = __nfs_revalidate_inode(server, inode);
@@ -2998,16 +3006,10 @@ out_notsup:
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
+ NFS_INO_INVALID_OTHER);
if (res == 0)
res = generic_permission(&init_user_ns, inode, mask);
goto out;
}
EXPORT_SYMBOL_GPL(nfs_permission);
-
-/*
- * Local variables:
- * version-control: t
- * kept-new-versions: 5
- * End:
- */
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index f2b34cfe286c..37a1a88df771 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -169,19 +169,8 @@ out:
static u64 nfs_fetch_iversion(struct inode *inode)
{
- struct nfs_server *server = NFS_SERVER(inode);
-
- /* Is this the right call?: */
- nfs_revalidate_inode(server, inode);
- /*
- * Also, note we're ignoring any returned error. That seems to be
- * the practice for cache consistency information elsewhere in
- * the server, but I'm not sure why.
- */
- if (server->nfs_client->rpc_ops->version >= 4)
- return inode_peek_iversion_raw(inode);
- else
- return time_to_chattr(&inode->i_ctime);
+ nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+ return inode_peek_iversion_raw(inode);
}
const struct export_operations nfs_export_ops = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 16ad5050e046..1fef107961bc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -105,7 +105,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
if (filp->f_flags & O_DIRECT)
goto force_reval;
- if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE))
goto force_reval;
return 0;
force_reval:
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d158a500c25c..d2103852475f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -718,7 +718,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (unlikely(!p))
goto out_err;
fl->fh_array[i]->size = be32_to_cpup(p++);
- if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+ if (fl->fh_array[i]->size > NFS_MAXFHSIZE) {
printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
goto out_err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 872112bffcab..d383de00d486 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -106,7 +106,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
if (unlikely(!p))
return -ENOBUFS;
fh->size = be32_to_cpup(p++);
- if (fh->size > sizeof(struct nfs_fh)) {
+ if (fh->size > NFS_MAXFHSIZE) {
printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
fh->size);
return -EOVERFLOW;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index a06d213d7689..d95c9a39bc70 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -283,20 +283,40 @@ static int nfs_verify_server_address(struct sockaddr *addr)
return 0;
}
+#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+ return true;
+}
+#else
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+ if (ctx->version == 4)
+ return true;
+ return false;
+}
+#endif
+
/*
* Sanity check the NFS transport protocol.
- *
*/
-static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
+static int nfs_validate_transport_protocol(struct fs_context *fc,
+ struct nfs_fs_context *ctx)
{
switch (ctx->nfs_server.protocol) {
case XPRT_TRANSPORT_UDP:
+ if (nfs_server_transport_udp_invalid(ctx))
+ goto out_invalid_transport_udp;
+ break;
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_RDMA:
break;
default:
ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
}
+ return 0;
+out_invalid_transport_udp:
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
}
/*
@@ -305,8 +325,6 @@ static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
*/
static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx)
{
- nfs_validate_transport_protocol(ctx);
-
if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP ||
ctx->mount_server.protocol == XPRT_TRANSPORT_TCP)
return;
@@ -932,6 +950,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
struct nfs_fh *mntfh = ctx->mntfh;
struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+ int ret;
if (data == NULL)
goto out_no_data;
@@ -977,6 +996,15 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
sizeof(mntfh->data) - mntfh->size);
/*
+ * for proto == XPRT_TRANSPORT_UDP, which is what uses
+ * to_exponential, implying shift: limit the shift value
+ * to BITS_PER_LONG (majortimeo is unsigned long)
+ */
+ if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */
+ if (data->retrans >= 64) /* shift value is too large */
+ goto out_invalid_data;
+
+ /*
* Translate to nfs_fs_context, which nfs_fill_super
* can deal with.
*/
@@ -1048,6 +1076,10 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
goto generic;
}
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
+
ctx->skip_reconfig_option_check = true;
return 0;
@@ -1076,6 +1108,9 @@ out_no_address:
out_invalid_fh:
return nfs_invalf(fc, "NFS: invalid root filehandle");
+
+out_invalid_data:
+ return nfs_invalf(fc, "NFS: invalid binary mount data");
}
#if IS_ENABLED(CONFIG_NFS_V4)
@@ -1146,6 +1181,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ int ret;
char *c;
if (!data) {
@@ -1218,9 +1254,9 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
ctx->acdirmin = data->acdirmin;
ctx->acdirmax = data->acdirmax;
ctx->nfs_server.protocol = data->proto;
- nfs_validate_transport_protocol(ctx);
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
done:
ctx->skip_reconfig_option_check = true;
return 0;
@@ -1231,9 +1267,6 @@ out_inval_auth:
out_no_address:
return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
-
-out_invalid_transport_udp:
- return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
}
#endif
@@ -1298,6 +1331,10 @@ static int nfs_fs_context_validate(struct fs_context *fc)
if (!nfs_verify_server_address(sap))
goto out_no_address;
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
+
if (ctx->version == 4) {
if (IS_ENABLED(CONFIG_NFS_V4)) {
if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
@@ -1306,9 +1343,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
port = NFS_PORT;
max_namelen = NFS4_MAXNAMLEN;
max_pathlen = NFS4_MAXPATHLEN;
- nfs_validate_transport_protocol(ctx);
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL |
NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK |
NFS_MOUNT_LOCAL_FCNTL);
@@ -1317,10 +1351,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
}
} else {
nfs_set_mount_transport_protocol(ctx);
-#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
-#endif
if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
port = NFS_RDMA_PORT;
}
@@ -1354,8 +1384,6 @@ out_no_device_name:
out_v4_not_compiled:
nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
return -EPROTONOSUPPORT;
-out_invalid_transport_udp:
- return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
out_no_address:
return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
out_mountproto_mismatch:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5a8854de0c19..529c4099f482 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -164,34 +164,19 @@ static int nfs_attribute_timeout(struct inode *inode)
return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
-static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags)
+static bool nfs_check_cache_flags_invalid(struct inode *inode,
+ unsigned long flags)
{
unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
- /* Special case for the pagecache or access cache */
- if (flags == NFS_INO_REVAL_PAGECACHE &&
- !(cache_validity & NFS_INO_REVAL_FORCED))
- return false;
return (cache_validity & flags) != 0;
}
-static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags)
-{
- unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-
- if ((cache_validity & flags) != 0)
- return true;
- if (nfs_attribute_timeout(inode))
- return true;
- return false;
-}
-
bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
{
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
- return nfs_check_cache_invalid_delegated(inode, flags);
-
- return nfs_check_cache_invalid_not_delegated(inode, flags);
+ if (nfs_check_cache_flags_invalid(inode, flags))
+ return true;
+ return nfs_attribute_cache_expired(inode);
}
EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
@@ -214,20 +199,21 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
if (have_delegation) {
if (!(flags & NFS_INO_REVAL_FORCED))
- flags &= ~NFS_INO_INVALID_OTHER;
- flags &= ~(NFS_INO_INVALID_CHANGE
- | NFS_INO_INVALID_SIZE
- | NFS_INO_REVAL_PAGECACHE
- | NFS_INO_INVALID_XATTR);
- }
+ flags &= ~(NFS_INO_INVALID_MODE |
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_XATTR);
+ flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+ } else if (flags & NFS_INO_REVAL_PAGECACHE)
+ flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
if (!nfs_has_xattr_cache(nfsi))
flags &= ~NFS_INO_INVALID_XATTR;
+ if (flags & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
if (inode->i_mapping->nrpages == 0)
flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
+ flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
nfsi->cache_validity |= flags;
- if (flags & NFS_INO_INVALID_DATA)
- nfs_fscache_invalidate(inode);
}
EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -452,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
.fattr = fattr
};
struct inode *inode = ERR_PTR(-ENOENT);
+ u64 fattr_supported = NFS_SB(sb)->fattr_valid;
unsigned long hash;
nfs_attr_check_mountpoint(sb, fattr);
@@ -484,8 +471,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_mode = fattr->mode;
nfsi->cache_validity = 0;
if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
- && nfs_server_capable(inode, NFS_CAP_MODE))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ && (fattr_supported & NFS_ATTR_FATTR_MODE))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -530,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
- else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
- else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
- else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
inode_set_iversion_raw(inode, fattr->change_attr);
@@ -550,29 +537,31 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
set_nlink(inode, fattr->nlink);
- else if (nfs_server_capable(inode, NFS_CAP_NLINK))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
- else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
- else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
if (nfs_server_capable(inode, NFS_CAP_XATTR))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
+ fattr->size != 0)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
-
- if (nfsi->cache_validity != 0)
- nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+ } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED &&
+ fattr->size != 0)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
nfs_setsecurity(inode, fattr, label);
@@ -634,8 +623,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Optimization: if the end result is no change, don't RPC */
- attr->ia_valid &= NFS_VALID_ATTRS;
- if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+ if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
trace_nfs_setattr_enter(inode);
@@ -710,12 +698,20 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
spin_lock(&inode->i_lock);
NFS_I(inode)->attr_gencount = fattr->gencount;
if ((attr->ia_valid & ATTR_SIZE) != 0) {
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME |
+ NFS_INO_INVALID_BLOCKS);
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME;
+ if ((attr->ia_valid & ATTR_KILL_SUID) != 0 &&
+ inode->i_mode & S_ISUID)
+ inode->i_mode &= ~S_ISUID;
+ if ((attr->ia_valid & ATTR_KILL_SGID) != 0 &&
+ (inode->i_mode & (S_ISGID | S_IXGRP)) ==
+ (S_ISGID | S_IXGRP))
+ inode->i_mode &= ~S_ISGID;
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -793,14 +789,28 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
dput(parent);
}
-static bool nfs_need_revalidate_inode(struct inode *inode)
+static u32 nfs_get_valid_attrmask(struct inode *inode)
{
- if (NFS_I(inode)->cache_validity &
- (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
- return true;
- if (nfs_attribute_cache_expired(inode))
- return true;
- return false;
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ u32 reply_mask = STATX_INO | STATX_TYPE;
+
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ reply_mask |= STATX_ATIME;
+ if (!(cache_validity & NFS_INO_INVALID_CTIME))
+ reply_mask |= STATX_CTIME;
+ if (!(cache_validity & NFS_INO_INVALID_MTIME))
+ reply_mask |= STATX_MTIME;
+ if (!(cache_validity & NFS_INO_INVALID_SIZE))
+ reply_mask |= STATX_SIZE;
+ if (!(cache_validity & NFS_INO_INVALID_NLINK))
+ reply_mask |= STAT