summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c177
-rw-r--r--fs/ext4/extents_status.c35
-rw-r--r--fs/ext4/fast_commit.c460
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c510
-rw-r--r--fs/ext4/ioctl.c16
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/namei.c10
-rw-r--r--fs/ext4/orphan.c13
-rw-r--r--fs/ext4/readpage.c28
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c84
-rw-r--r--fs/ext4/xattr.c10
20 files changed, 1040 insertions, 449 deletions
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index a4dbaccee6e7..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
@@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a20e9cd7184..18373de980f2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -256,9 +256,19 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_DELAYED)
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -706,9 +716,6 @@ enum {
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -720,11 +727,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. Especially, in the generic writeback path, caller will
+ * submit data before dropping transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
/* Caller is in the atomic contex, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -738,6 +757,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to filter the flags needed to
+ * pass while lookup/querying of on disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -1061,16 +1087,16 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
-
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1754,7 +1780,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
@@ -1913,6 +1939,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
@@ -2295,10 +2322,12 @@ static inline int ext4_emergency_state(struct super_block *sb)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
/*
* Default values for superblock update
@@ -2487,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
return crc32c(crc, address, length);
}
@@ -2922,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
@@ -2973,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2993,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+bool ext4_should_enable_large_folio(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3039,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
@@ -3050,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3119,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
-extern __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3378,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3710,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3847,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
- return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 135e278c832e..b3e9b7bd7978 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !test_opt(inode->i_sb, DELALLOC) &&
+ !mapping_large_folio_support(inode->i_mapping))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 3221714d9901..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c616a16a9f36..b543a46fc809 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1604,7 +1605,8 @@ got_index:
ix++;
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1612,7 +1614,7 @@ got_index:
put_bh(bh);
}
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
@@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2851,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2918,7 +2922,7 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, flags);
if (err < 0)
goto out;
if (pblk) {
@@ -2994,8 +2998,7 @@ again:
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode, path[i].p_idx,
- depth - i - 1,
- EXT4_EX_NOCACHE);
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
if (err < 0)
goto out;
@@ -4433,6 +4437,20 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4781,6 +4799,93 @@ out_inode_lock:
}
/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. all unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io call back function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We mainly only allocate
+ * unwritten extents either on a hole on a pre-exiting unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for conversion of this
+ * unwritten extent split across leaf block within a single journal transaction.
+ * Split extents across leaf nodes is a rare case, but let's still handle that
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+ * it can tell if the extent in the cache is a split extent.
+ * But for now let's assume pextents as 2 always.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+ }
+
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
+
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
+ }
+
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
+
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
+
+ return ret > 0 ? ret2 : ret;
+}
+
+/*
* This function convert a range of blocks to written extents
* The caller of this function will pass the start offset and the size.
* all unwritten extents within this range will be converted to
@@ -4819,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+ * Do not cache any unrelated extents, as it does not hold the
+ * i_rwsem or invalidate_lock, which could corrupt the extent
+ * status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4931,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -4956,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -4970,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -4999,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -5328,6 +5441,8 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
end_lblk = (offset + len) >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
@@ -5429,6 +5544,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
len_lblk = len >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d1401d4a5513..31dc0496f8d0 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trusts that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adheres to the following rules when createing,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only holds a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..42bee1d4f9f9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the exsiting handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
*/
#include <trace/events/ext4.h>
@@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
}
static bool ext4_fc_disabled(struct super_block *sb)
@@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb)
}
/*
- * Inform Ext4's fast about start of an inode update
- *
- * This function is called by the high level call VFS callbacks before
- * performing any inode update. This function blocks if there's an ongoing
- * fast commit on the inode in question.
- */
-void ext4_fc_start_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
-restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list))
- goto out;
-
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-out:
- atomic_inc(&ei->i_fc_updates);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-
-/*
- * Stop inode update and wake up waiting fast commits if any.
- */
-void ext4_fc_stop_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
- if (atomic_dec_and_test(&ei->i_fc_updates))
- wake_up_all(&ei->i_fc_wait);
-}
-
-/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
*/
@@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
+ wait_queue_head_t *wq;
if (ext4_fc_disabled(inode->i_sb))
return;
-restart:
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ schedule();
+ mutex_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(wq, &wait.wq_entry);
}
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ list_del_init(&ei->i_fc_list);
/*
* Since this inode is getting removed, let's also remove all FC
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
@@ -320,12 +298,10 @@ restart:
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}
/*
@@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -385,7 +361,7 @@ static int ext4_fc_track_template(
int ret;
tid = handle->h_transaction->t_tid;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -393,19 +369,18 @@ static int ext4_fc_track_template(
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(handle, inode, args, update);
- mutex_unlock(&ei->i_fc_lock);
-
+ spin_unlock(&ei->i_fc_lock);
if (!enqueue)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
if (IS_ENCRYPTED(dir)) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -EOPNOTSUPP;
}
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
node->fcd_ino = inode->i_ino;
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
- spin_lock(&sbi->s_fc_lock);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- spin_unlock(&sbi->s_fc_lock);
- mutex_lock(&ei->i_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return 0;
}
@@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem when we go to sleep since
+ * the commit path needs to grab the lock while committing the inode.
+ */
+ lockdep_assert_not_held(&ei->i_data_sem);
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
+ /*
+ * From this point on, this inode will not be committed either
+ * by fast or full commit as long as the handle is open.
+ */
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
@@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
tl.fc_len = cpu_to_le16(remaining);
memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
- *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+ *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
ext4_fc_submit_bh(sb, false);
@@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
- crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
@@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
struct ext4_extent *ex;
int ret;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
return 0;
}
old_blk_size = ei->i_fc_lblk_start;
new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
ext4_debug("will try writing %d to %d for inode %ld\n",
@@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
map.m_len = new_blk_size - cur_lblk_off + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ ret = ext4_map_blocks(NULL, inode, &map,
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE);
if (ret < 0)
return -ECANCELED;
@@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
}
-/* Submit data for all the fast commit inodes */
-static int ext4_fc_submit_inode_data_all(journal_t *journal)
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
- spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
- return ret;
-}
-
-/* Wait for completion of data for all the fast commit inodes */
-static int ext4_fc_wait_inode_data_all(journal_t *journal)
-{
- struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *pos, *n;
- int ret = 0;
-
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- if (!ext4_test_inode_state(&pos->vfs_inode,
- EXT4_STATE_FC_COMMITTING))
- continue;
- spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_wait_inode_data(journal, pos->jinode);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_wait_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
return 0;
}
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
-__acquires(&sbi->s_fc_lock)
-__releases(&sbi->s_fc_lock)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock)
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
- spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
- spin_lock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
continue;
}
/*
* With fcd_dilist we need not loop in sbi->s_fc_q to get the
- * corresponding inode pointer
+ * corresponding inode. Also, the corresponding inode could have been
+ * deleted, in which case, we don't need to do anything.
*/
- WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
- spin_unlock(&sbi->s_fc_lock);
-
/*
* We first write the inode and then the create dirent. This
* allows the recovery code to create an unnamed inode first
@@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock)
*/
ret = ext4_fc_write_inode(inode, crc);
if (ret)
- goto lock_and_exit;
-
+ return ret;
ret = ext4_fc_write_inode_data(inode, crc);
if (ret)
- goto lock_and_exit;
-
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
-
- spin_lock(&sbi->s_fc_lock);
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
}
return 0;
-lock_and_exit:
- spin_lock(&sbi->s_fc_lock);
- return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
@@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
- ret = ext4_fc_submit_inode_data_all(journal);
- if (ret)
- return ret;
+ /*
+ * Step 1: Mark all inodes on s_fc_q[MAIN] with
+ * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+ * freed until the data flush is over.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /* Step 2: Flush data for all the eligible inodes. */
+ ret = ext4_fc_flush_data(journal);
- ret = ext4_fc_wait_inode_data_all(journal);
+ /*
+ * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+ * any error from step 2. This ensures that waiters waiting on
+ * EXT4_STATE_FC_FLUSHING_DATA can resume.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ }
+
+ /*
+ * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+ * the waiter checks the bit. Pairs with implicit barrier in
+ * prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /*
+ * If we encountered error in Step 2, return it now after clearing
+ * EXT4_STATE_FC_FLUSHING_DATA bit.
+ */
if (ret)
return ret;
+
+ /* Step 4: Mark all inodes as being committed. */
+ jbd2_journal_lock_updates(journal);
/*
- * If file system device is different from journal device, issue a cache
- * flush before we start writing fast commit blocks.
+ * The journal is now locked. No more handles can start and all the
+ * previous handles are now drained. We now mark the inodes on the
+ * commit queue as being committed.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
+ /*
+ * Step 5: If file system device is different from journal device,
+ * issue a cache flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
+ /* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
- * Add a head tag only if this is the first fast commit
- * in this TID.
+ * Step 6.1: Add a head tag only if this is the first fast
+ * commit in this TID.
*/
head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
head.fc_tid = cpu_to_le32(
@@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
}
- spin_lock(&sbi->s_fc_lock);
+ /* Step 6.2: Now write all the dentry updates. */
+ mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
- if (ret) {
- spin_unlock(&sbi->s_fc_lock);
+ if (ret)
goto out;
- }
+ /* Step 6.3: Now write all the changed inodes to disk. */
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
ret = ext4_fc_write_inode_data(inode, &crc);
if (ret)
goto out;
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
+ /* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
out:
+ mutex_unlock(&sbi->s_fc_lock);
blk_finish_plug(&plug);
return ret;
}
@@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int subtid = atomic_read(&sbi->s_fc_subtid);
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
+ int old_ioprio, journal_ioprio;
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
@@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
trace_ext4_fc_commit_start(sb, commit_tid);
start_time = ktime_get();
+ old_ioprio = get_current_ioprio();
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
@@ -1228,6 +1241,15 @@ restart_fc:
goto fallback;
}
+ /*
+ * Now that we know that this thread is going to do a fast commit,
+ * elevate the priority to match that of the journal thread.
+ */
+ if (journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ else
+ journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+ set_task_ioprio(current, journal_ioprio);
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
@@ -1242,6 +1264,7 @@ restart_fc:
}
atomic_inc(&sbi->s_fc_subtid);
ret = jbd2_fc_end_commit(journal);
+ set_task_ioprio(current, old_ioprio);
/*
* weight the commit time higher than the average time so we
* don't react too strongly to vast changes in the commit time
@@ -1251,6 +1274,7 @@ restart_fc:
return ret;
fallback:
+ set_task_ioprio(current, old_ioprio);
ret = jbd2_fc_end_commit_fallback(journal);
ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
@@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *iter, *iter_n;
+ struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
if (full && sbi->s_fc_bh)
@@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- list_del_init(&iter->i_fc_list);
- ext4_clear_inode_state(&iter->vfs_inode,
+ mutex_lock(&sbi->s_fc_lock);
+ while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+ struct ext4_inode_info,
+ i_fc_list);
+ list_del_init(&ei->i_fc_list);
+ ext4_clear_inode_state(&ei->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, iter->i_sync_tid)) {
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (tid_geq(tid, ei->i_sync_tid)) {
+ ext4_fc_reset_inode(&ei->vfs_inode);
} else if (full) {
/*
* We are called after a full commit, inode has been
@@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
* time in that case (and tid doesn't increase so
* tid check above isn't reliable).
*/
- list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+ list_add_tail(&ei->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
}
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
#if (BITS_PER_LONG < 64)
- wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
- wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
}
@@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
fcd_list);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
- spin_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- spin_lock(&sbi->s_fc_lock);
}
list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
@@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
trace_ext4_fc_stats(sb);
}
@@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_INODE:
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
@@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
break;
}
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index beb078ee4811..21df81347147 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+
+ if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+ (iocb->ki_flags & IOCB_ATOMIC))
+ error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+ size);
+ else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
@@ -929,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e7ecc7c8a729..79aa3df8d019 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1288,10 +1288,9 @@ got:
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -1336,6 +1335,9 @@ got:
}
}
+ if (ext4_should_enable_large_folio(inode))
+ mapping_set_large_folios(inode->i_mapping);
+
ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2c9b762925c7..a1bbcdf40824 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -397,7 +397,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -601,6 +601,7 @@ retry:
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cdf01e60fa6d..be9a4cba35fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -142,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
@@ -416,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -462,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
+
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map)
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
+ unsigned int orig_mlen = map->m_len;
+ flags &= EXT4_EX_QUERY_FILTER;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval <= 0)
return retval;
@@ -484,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
WARN_ON(1);
}
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
- return retval;
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return retval;
+ }
+
+ return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -602,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -622,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers from the context of data submission are the only exceptions
+ * for regular files that do not hold the i_rwsem or invalidate_lock.
+ * However, caching unrelated ranges is not permitted.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
@@ -653,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -667,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- retval = ext4_map_query_blocks(handle, inode, map);
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -696,6 +801,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
+
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -1009,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
@@ -1027,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
@@ -1041,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_buffers(folio);
@@ -1152,6 +1263,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
unsigned from, to;
+ fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -1164,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
@@ -1184,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -1765,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
map->m_len = min_t(unsigned int, map->m_len,
@@ -1805,7 +1925,7 @@ found:
if (ext4_has_inline_data(inode))
retval = 0;
else
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval < 0 ? retval : 0;
@@ -1828,7 +1948,7 @@ add_delayed:
goto found;
}
} else if (!ext4_has_inline_data(inode)) {
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
return retval < 0 ? retval : 0;
@@ -1936,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
return err;
}
@@ -2159,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2170,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
+ lblk = folio->index << bpp_bits;
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2212,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
+ * possible. In addition, do not cache any unrelated extents, as it
+ * only holds the folio lock but does not hold the i_rwsem or
+ * invalidate_lock, which could corrupt the extent status tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2355,7 +2479,7 @@ update_disksize:
*/
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
@@ -2391,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
size_t len = folio_size(folio);
folio_clear_checked(folio);
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
@@ -2433,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
- int bpp = ext4_journal_blocks_per_page(mpd->inode);
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2920,6 +3044,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
struct inode *inode = mapping->host;
+ fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -2945,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
if (ret < 0) {
@@ -3038,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
unsigned long end;
i_size_write(inode, new_i_size);
- end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ end = offset_in_folio(folio, new_i_size - 1);
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
@@ -3340,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped mapping. This will ensure any unwritten or hole
+ * regions within the requested range is zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3353,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * journal credits estimation for atomic writes. We call
+ * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+ * then let's assume the no. of pextents required can be m_len i.e.
+ * every alternate block can be unwritten and hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3384,7 +3673,11 @@ retry:
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3398,6 +3691,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3408,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3421,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
if (flags & IOMAP_WRITE) {
/*
@@ -3431,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
- goto out;
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
+ /*
+ * This can be called for overwrites path from
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3449,6 +3772,16 @@ out:
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3690,9 +4023,7 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3707,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
if (!bh)
bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -4006,7 +4338,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t start_lblk, end_lblk;
- loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+ loff_t max_end = sb->s_maxbytes;
loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
@@ -4015,14 +4347,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
WARN_ON_ONCE(!inode_is_locked(inode));
+ /*
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
+ if (offset >= inode->i_size || offset >= max_end)
return 0;
/*
* If the hole extends beyond i_size, set the hole to end after
- * the page that contains i_size, and also make sure that the hole
- * within one block before last range.
+ * the page that contains i_size.
*/
if (end > inode->i_size)
end = round_up(inode->i_size, PAGE_SIZE);
@@ -4072,6 +4410,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (end_lblk > start_lblk) {
ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -4224,8 +4564,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4760,10 +5102,27 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
return 0;
error:
- ext4_error_inode(inode, function, line, 0, err_str);
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
return -EFSCORRUPTED;
}
+bool ext4_should_enable_large_folio(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_encrypt(sb))
+ return false;
+
+ return true;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -4781,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
- ino == le32_to_cpu(es->s_usr_quota_inum) ||
- ino == le32_to_cpu(es->s_grp_quota_inum) ||
- ino == le32_to_cpu(es->s_prj_quota_inum) ||
- ino == le32_to_cpu(es->s_orphan_file_inum))) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
@@ -4845,10 +5199,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
@@ -4916,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5086,6 +5440,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ if (ext4_should_enable_large_folio(inode))
+ mapping_set_large_folios(inode->i_mapping);
+
ret = check_igot_inode(inode, flags, function, line);
/*
* -ESTALE here means there is nothing inherently wrong with the inode,
@@ -5564,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
down_write(&EXT4_I(inode)->i_data_sem);
old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5577,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
else
EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5773,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5782,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and lead blocks need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks + pextents;
+ groups = idxblocks;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -5801,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5821,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
int ext4_writepage_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
@@ -5895,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d17207386ead..5668a17458ae 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -143,7 +143,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
if (ext4_has_feature_metadata_csum(sb) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
unlock_buffer(bh);
@@ -151,7 +151,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
}
func(es, arg);
if (ext4_has_feature_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -354,8 +354,8 @@ void ext4_reset_inode_seed(struct inode *inode)
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
/*
@@ -1505,8 +1505,14 @@ resizefs_out:
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
- return ext4_ext_precache(inode);
+ {
+ int ret;
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 3e26464b1425..51661570cf3b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 48649be64d6a..1f8493a56e8f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
unsigned int tmp_data_size, data_size, replaced_size;
int i, err2, jblocks, retries = 0;
int replaced_count = 0;
- int from = data_offset_in_page << orig_inode->i_blkbits;
+ int from;
int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
struct buffer_head *bh = NULL;
@@ -323,11 +323,6 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
-
- VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
- VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
- VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
-
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -360,6 +355,8 @@ again:
goto unlock_folios;
}
data_copy:
+ from = offset_in_folio(folio[0],
+ orig_blk_offset << orig_inode->i_blkbits);
*err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
if (*err)
goto unlock_folios;
@@ -390,7 +387,7 @@ data_copy:
if (!bh)
bh = create_empty_buffers(folio[0],
1 << orig_inode->i_blkbits, 0);
- for (i = 0; i < data_offset_in_page; i++)
+ for (i = 0; i < from >> orig_inode->i_blkbits; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e9712e64ec8f..a178ac229489 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
return cpu_to_le32(csum);
}
@@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int count_offset, int count, struct dx_tail *t)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
int size;
@@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(csum, (__u8 *)t, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index c66e0cb29bd4..7c7f792ad6ab 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
return 1;
ot = ext4_orphan_block_tail(sb, bh);
- calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
inodes_per_ob * sizeof(__u32));
return le32_to_cpu(ot->ob_checksum) == calculated;
}
@@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
- inodes_per_ob * sizeof(__u32));
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
ot = ext4_orphan_block_tail(sb, bh);
ot->ob_checksum = cpu_to_le32(csum);
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5d3a9dc9a32d..f329daf6e5c7 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
- unsigned int nr_pages = rac ? readahead_count(rac) : 1;
+ unsigned int nr_pages, folio_pages;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- for (; nr_pages; nr_pages--) {
+ nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio);
+ for (; nr_pages; nr_pages -= folio_pages) {
int fully_mapped = 1;
- unsigned first_hole = blocks_per_page;
+ unsigned int first_hole;
+ unsigned int blocks_per_folio;
if (rac)
folio = readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
prefetchw(&folio->flags);
if (folio_buffers(folio))
goto confused;
+ blocks_per_folio = folio_size(folio) >> blkbits;
+ first_hole = blocks_per_folio;
block_in_file = next_block =
(sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
@@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode,
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode,
* Then do more ext4_map_blocks() calls until we are
* done with this folio.
*/
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
@@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode,
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode,
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
}
}
- if (first_hole != blocks_per_page) {
+ if (first_hole != blocks_per_folio) {
folio_zero_segment(folio, first_hole << blkbits,
folio_size(folio));
if (first_hole == 0) {
@@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
- (first_hole != blocks_per_page)) {
+ (first_hole != blocks_per_folio)) {
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = first_block + blocks_per_page - 1;
+ last_block_in_bio = first_block + blocks_per_folio - 1;
continue;
confused:
if (bio) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b7ff0d955f0d..050f26168d97 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1119,7 +1119,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
es->s_block_group_nr = cpu_to_le16(group);
if (ext4_has_feature_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 181934499624..a7f80ca01174 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -286,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-__le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct ext4_super_block *es)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+ csum = ext4_chksum(~0, (char *)es, offset);
return cpu_to_le32(csum);
}
@@ -304,7 +302,7 @@ static int ext4_superblock_csum_verify(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return 1;
- return es->s_checksum == ext4_superblock_csum(sb, es);
+ return es->s_checksum == ext4_superblock_csum(es);
}
void ext4_superblock_csum_set(struct super_block *sb)
@@ -314,7 +312,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
if (!ext4_has_feature_metadata_csum(sb))
return;
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -508,21 +506,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
ext4_maybe_update_superblock(sb);
}
-/*
- * This writepage callback for write_cache_pages()
- * takes care of a few cases after page cleaning.
- *
- * write_cache_pages() already checks for dirty pages
- * and calls clear_page_dirty_for_io(), which we want,
- * to write protect the pages.
- *
- * However, we may have to redirty a page (see below.)
- */
-static int ext4_journalled_writepage_callback(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
+static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
+ struct folio *folio)
{
- transaction_t *transaction = (transaction_t *) data;
struct buffer_head *bh, *head;
struct journal_head *jh;
@@ -543,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio,
*/
jh = bh2jh(bh);
if (buffer_dirty(bh) ||
- (jh && (jh->b_transaction != transaction ||
- jh->b_next_transaction))) {
- folio_redirty_for_writepage(wbc, folio);
- goto out;
- }
+ (jh && (jh->b_transaction != jinode->i_transaction ||
+ jh->b_next_transaction)))
+ return true;
} while ((bh = bh->b_this_page) != head);
-out:
- return AOP_WRITEPAGE_ACTIVATE;
+ return false;
}
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -563,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
.range_start = jinode->i_dirty_start,
.range_end = jinode->i_dirty_end,
};
+ struct folio *folio = NULL;
+ int error;
- return write_cache_pages(mapping, &wbc,
- ext4_journalled_writepage_callback,
- jinode->i_transaction);
+ /*
+ * writeback_iter() already checks for dirty pages and calls
+ * folio_clear_dirty_for_io(), which we want to write protect the
+ * folios.
+ *
+ * However, we may have to redirty a folio sometimes.
+ */
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (ext4_journalled_writepage_needs_redirty(jinode, folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ folio_unlock(folio);
+ }
+
+ return error;
}
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -1415,7 +1411,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
- mutex_init(&ei->i_fc_lock);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1809,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
{}
};
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
@@ -3209,14 +3204,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__u32 csum32;
__u16 dummy_csum = 0;
- csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
sizeof(dummy_csum));
offset += sizeof(dummy_csum);
if (offset < sbi->s_desc_size)
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
@@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb)
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * With non-bigalloc filesystem awu will be based upon filesystem blocksize
+ * & bdev awu units.
+ * With bigalloc it will be based upon bigalloc cluster size & bdev awu units.
* @sb: super block
- * TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
+ unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
if (!bdev_can_atomic_write(bdev))
return;
@@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb)
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
- sbi->s_awu_max = min(sb->s_blocksize,
+ sbi->s_awu_max = min(clustersize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
@@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
+ mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
@@ -4644,7 +4642,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
else if (ext4_has_feature_metadata_csum(sb) ||
ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
}
@@ -5255,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -5916,7 +5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
errno = -EFSCORRUPTED;
goto out_bh;
@@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
ctx->journal_ioprio =
sbi->s_journal->j_task->io_context->ioprio;
else
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7ab8f2e8e815..8d15acbacc20 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
__u32 dummy_csum = 0;
int offset = offsetof(struct ext4_xattr_header, h_checksum);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
offset += sizeof(dummy_csum);
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ csum = ext4_chksum(csum, (__u8 *)hdr + offset,
EXT4_BLOCK_SIZE(inode->i_sb) - offset);
return cpu_to_le32(csum);
@@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
- return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+ return ext4_chksum(sbi->s_csum_seed, buffer, size);
}
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)