Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/balloc.c | 4
-rw-r--r-- | fs/ext4/bitmap.c | 16
-rw-r--r-- | fs/ext4/block_validity.c | 5
-rw-r--r-- | fs/ext4/dir.c | 7
-rw-r--r-- | fs/ext4/ext4.h | 183
-rw-r--r-- | fs/ext4/ext4_jbd2.c | 15
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 117
-rw-r--r-- | fs/ext4/extents.c | 700
-rw-r--r-- | fs/ext4/extents_status.c | 36
-rw-r--r-- | fs/ext4/fast_commit.c | 460
-rw-r--r-- | fs/ext4/file.c | 44
-rw-r--r-- | fs/ext4/fsync.c | 12
-rw-r--r-- | fs/ext4/hash.c | 2
-rw-r--r-- | fs/ext4/ialloc.c | 20
-rw-r--r-- | fs/ext4/inline.c | 210
-rw-r--r-- | fs/ext4/inode.c | 883
-rw-r--r-- | fs/ext4/ioctl.c | 29
-rw-r--r-- | fs/ext4/mballoc-test.c | 2
-rw-r--r-- | fs/ext4/mballoc.c | 29
-rw-r--r-- | fs/ext4/mmp.c | 8
-rw-r--r-- | fs/ext4/move_extent.c | 13
-rw-r--r-- | fs/ext4/namei.c | 139
-rw-r--r-- | fs/ext4/orphan.c | 15
-rw-r--r-- | fs/ext4/page-io.c | 77
-rw-r--r-- | fs/ext4/readpage.c | 28
-rw-r--r-- | fs/ext4/resize.c | 6
-rw-r--r-- | fs/ext4/super.c | 353
-rw-r--r-- | fs/ext4/sysfs.c | 4
-rw-r--r-- | fs/ext4/xattr.c | 57
-rw-r--r-- | fs/ext4/xattr.h | 10
30 files changed, 1970 insertions(+), 1514 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8042ad873808..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 2a135075468d..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -25,12 +25,12 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -48,11 +48,11 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -67,11 +67,11 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -89,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 87ee3a17bd29..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/dir.c 
b/fs/ext4/dir.c index 02d47a64e8d1..d4164c507a90 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); - bool has_csum = ext4_has_metadata_csum(dir->i_sb); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; @@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; /* Can we just clear INDEX flag to ignore htree information? */ - if (!ext4_has_metadata_csum(sb)) { + if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e7de7eaa374..18373de980f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -256,9 +256,19 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. 
+ */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_DELAYED) + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -278,7 +288,10 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -367,6 +380,8 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -701,9 +716,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -715,11 +727,23 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the @@ -733,6 +757,13 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks @@ -1056,15 +1087,16 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - atomic_t i_unwritten; /* Nr. 
of inflight conversions pending */ + spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ - struct mutex i_fc_lock; + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not @@ -1097,8 +1129,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. @@ -1141,6 +1171,7 @@ struct ext4_inode_info { /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif + spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; @@ -1151,8 +1182,6 @@ struct ext4_inode_info { struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - spinlock_t i_block_reservation_lock; - /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. @@ -1606,6 +1635,8 @@ struct ext4_sb_info { unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; + unsigned int s_sb_update_sec; + unsigned int s_sb_update_kb; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1749,7 +1780,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; @@ -1821,7 +1852,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) @@ -1907,6 +1939,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; @@ -2232,15 +2265,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ -#define EXT4_FLAGS_RESIZING 0 -#define EXT4_FLAGS_SHUTDOWN 1 -#define EXT4_FLAGS_BDEV_IS_DAX 2 +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } +static inline int ext4_emergency_ro(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; +} + /* * Default values 
for user and/or group using reserved blocks */ @@ -2272,10 +2322,19 @@ static inline int ext4_forced_shutdown(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + + +/* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + /* * Minimum number of groups in a flexgroup before we separate out @@ -2457,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } @@ -2810,8 +2868,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); @@ -2893,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); @@ -2944,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr) void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, @@ -2964,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3001,6 +3058,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -3008,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode); extern int 
ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); @@ -3019,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3088,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3259,14 +3330,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_metadata_csum(struct super_block *sb) -{ - return ext4_has_feature_metadata_csum(sb); -} - static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ @@ -3351,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3546,11 +3620,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); @@ -3683,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); 
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, @@ -3785,34 +3861,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - struct inode *inode = io_end->inode; - - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - } } extern const struct iomap_ops ext4_iomap_ops; @@ -3835,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) static inline bool ext4_inode_can_atomic_write(struct inode *inode) { - return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index da4a82456383..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ @@ -63,12 +64,14 @@ static void ext4_put_nojournal(handle_t *handle) */ static int ext4_journal_check_start(struct super_block *sb) { + int ret; journal_t *journal; might_sleep(); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; @@ -244,7 +247,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, } } else ext4_check_bdev_write_error(sb); - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, @@ -331,7 +335,8 @@ int 
__ext4_journal_get_create_access(const char *where, unsigned int line, err); return err; } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0c77697d5e90..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,90 +122,6 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 -/** - * struct ext4_journal_cb_entry - Base structure for callback information. - * - * This struct is a 'seed' structure for a using with your own callback - * structs. If you are using callbacks you must allocate one of these - * or another struct of your own definition which has this struct - * as it's first element and pass it to ext4_journal_callback_add(). - */ -struct ext4_journal_cb_entry { - /* list information for other callbacks attached to the same handle */ - struct list_head jce_list; - - /* Function to call with this callback structure */ - void (*jce_func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int error); - - /* user data goes here */ -}; - -/** - * ext4_journal_callback_add: add a function to call after transaction commit - * @handle: active journal transaction handle to register callback on - * @func: callback function to call after the transaction has committed: - * @sb: superblock of current filesystem for transaction - * @jce: returned journal callback data - * @rc: journal state at commit (0 = transaction committed properly) - * @jce: journal callback data (internal and function private data struct) - * - * The registered function will be called in the context of the journal thread - * after the transaction for which the handle was created has completed. - * - * No locks are held when the callback function is called, so it is safe to - * call blocking functions from within the callback, but the callback should - * not block or run for too long, or the filesystem will be blocked waiting for - * the next transaction to commit. No journaling functions can be used, or - * there is a risk of deadlock. - * - * There is no guaranteed calling order of multiple registered callbacks on - * the same transaction. 
- */ -static inline void _ext4_journal_callback_add(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - /* Add the jce to transaction's private list */ - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -} - -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, jce); - spin_unlock(&sbi->s_md_lock); -} - - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - * Return true if object was successfully removed - */ -static inline bool ext4_journal_callback_try_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - bool deleted; - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - deleted = !list_empty(&jce->jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - return deleted; -} - int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -403,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } @@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. 
+ */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a07a98a4b97a..b543a46fc809 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } @@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -77,7 +76,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1604,7 +1605,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1612,7 +1614,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. 
+ */ if (extents <= 1) - index = depth * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = depth * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } @@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2851,8 +2856,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2918,7 +2922,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2994,8 +2998,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); @@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); if (err < 0) goto out; @@ -4433,6 +4437,20 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, @@ -4568,131 +4586,65 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. 
This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. Here, start and partial_begin are inclusive, end and - * partial_end are exclusive. - */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. 
- */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + /* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4705,27 +4657,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files only. 
*/ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4739,12 +4733,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4764,74 +4754,135 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; + goto out_inode_lock; - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; } - if (mode & FALLOC_FL_ZERO_RANGE) { + /* + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. + */ + filemap_invalidate_lock(mapping); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_invalidate_lock; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = ext4_punch_hole(file, offset, len); + else if (mode & FALLOC_FL_COLLAPSE_RANGE) + ret = ext4_collapse_range(file, offset, len); + else if (mode & FALLOC_FL_INSERT_RANGE) + ret = ext4_insert_range(file, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) ret = ext4_zero_range(file, offset, len, mode); - goto exit; - } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; + else + ret = -EOPNOTSUPP; + +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: + inode_unlock(inode); + return ret; +} +/* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. all unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end io call back function for + * atomic writes, to convert the unwritten extents after IO is completed. + * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly only allocate + * unwritten extents either on a hole on a pre-exiting unwritten extent range in + * ext4_map_blocks_atomic_write(). 
The only case where we can have multiple + * unwritten extents in a range [offset, offset+len) is when there is a split + * unwritten extent between two leaf nodes which was cached in extent status + * cache during ext4_iomap_alloc() time. That will allow + * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going + * into the slow path. That means we might need a loop for conversion of this + * unwritten extent split across leaf block within a single journal transaction. + * Split extents across leaf nodes is a rare case, but let's still handle that + * to meet the requirements of multi-fsblock atomic writes. + * + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; + + map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - inode_lock(inode); + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. + */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } - /* - * We only support preallocation for extent-based files only - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + ret2 = ext4_mark_inode_dirty(handle, inode); - ret = file_modified(file); - if (ret) - goto out; + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: - inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: - return ret; + return ret > 0 ? ret2 : ret; } /* @@ -4873,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. 
+ */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " @@ -4985,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -5010,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -5024,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5053,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; @@ -5332,109 +5391,74 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. - */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. 
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. 
- */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; + + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5442,18 +5466,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5473,100 +5495,65 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. - */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. 
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } depth = ext_depth(inode); @@ -5576,16 +5563,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - offset_lblk, split_flag, + start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | 
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -5594,32 +5581,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
}
ext4_free_ext_path(path);
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ae29832aab1e..31dc0496f8d0 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trust that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adhere to the following rules when creating,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only hold a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
@@ -1551,7 +1582,6 @@ retry:
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..42bee1d4f9f9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that the inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_journal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop.
Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include <trace/events/ext4.h> @@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb) } /* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - -/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ @@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. 
That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -320,12 +298,10 @@ restart: list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -385,7 +361,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,19 +369,18 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
&sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. 
+ */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } @@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", @@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. 
*/ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. 
This * allows the recovery code to create an unnamed inode first @@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. 
*/ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1228,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. 
+ */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1242,6 +1264,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1251,6 +1274,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; @@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a5205149adba..21df81347147 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; @@ -688,10 +693,12 @@ out: static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + int ret; struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) @@ -700,7 +707,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); - int ret; if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) @@ -756,9 +762,6 @@ retry: return VM_FAULT_SIGBUS; } } else { - result = filemap_fsnotify_fault(vmf); - if (unlikely(result)) - return result; filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); @@ -803,11 +806,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; 
+ if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; /* * We don't support synchronous mappings for non-DAX files and @@ -838,7 +846,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); @@ -881,8 +890,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) @@ -921,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index b40d3b29f7e5..e476c6de3074 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); - if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_ext4_flags value */ - smp_rmb(); - if (ext4_forced_shutdown(inode->i_sb)) - ret = -EROFS; + if (sb_rdonly(inode->i_sb)) goto out; - } if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index deabe29da7fb..33cd5b6b02d5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + buff = kzalloc(PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21d228073d79..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it - * must have been written out. + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. 
*/ goto out; @@ -951,8 +952,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return ERR_PTR(-EIO); + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1282,14 +1284,13 @@ got: inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -1298,7 +1299,7 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1334,6 +1335,9 @@ got: } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..a1bbcdf40824 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -20,6 +20,11 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) @@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -392,7 +397,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -596,6 +601,7 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; @@ -637,7 +643,7 @@ retry: goto retry; if (folio) - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); @@ -653,91 +659,109 @@ out_nofolio: } /* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. + * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). 
*/ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop) +int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; - goto out; + goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_stop_journal; - /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } + if (!da) { + brelse(iloc.bh); + /* Retry inside */ + return ext4_convert_inline_data_to_extent(mapping, inode); + } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out; + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out_release_bh; + } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); - goto out; + goto out_stop_journal; } - *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); + /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; - folio_unlock(folio); - folio_put(folio); - goto out_up_read; + goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) { - folio_unlock(folio); - folio_put(folio); - goto out_up_read; - } + if (ret < 0) + goto out_release_folio; } - ret = 1; - handle = NULL; -out_up_read: + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); + if (ret) + goto out_release_folio; + *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle && (ret != 1)) - ext4_journal_stop(handle); + brelse(iloc.bh); + return 1; + +out_release_folio: + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); +out_stop_journal: + ext4_journal_stop(handle); +out_release_bh: brelse(iloc.bh); return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, inode); +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop) +{ + if (pos + len > ext4_get_max_inline_size(inode)) + return ext4_convert_inline_data_to_extent(mapping, inode); + return ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, @@ -881,94 +905,6 @@ out: return ret; } -/* - * Prepare the write for the inline data. 
- * If the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata) -{ - int ret; - handle_t *handle; - struct folio *folio; - struct ext4_iloc iloc; - int retries = 0; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - fsdata); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto out_journal; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) - goto out_release_page; - } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out_release_page; - - up_read(&EXT4_I(inode)->xattr_sem); - *foliop = folio; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - folio_unlock(folio); - folio_put(folio); -out_journal: - ext4_journal_stop(handle); -out: - brelse(iloc.bh); - return ret; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1012,7 +948,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; @@ -1146,7 +1082,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..be9a4cba35fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -57,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = 
ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -93,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -114,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -141,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -181,6 +177,8 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); + dax_break_layout_final(inode); + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { @@ -383,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -412,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. 
+ */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -458,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -480,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -598,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -618,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -649,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -663,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -692,6 +801,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -751,7 +862,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. 
*/ - if (!bh->b_page) { + if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } @@ -1005,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } @@ -1023,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1037,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1148,9 +1263,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* @@ -1159,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1179,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -1760,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -1800,7 +1925,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1823,7 +1948,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? 
retval : 0; @@ -1931,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2154,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2165,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2207,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2225,7 +2354,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); @@ -2273,7 +2402,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2350,7 +2479,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2386,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) @@ -2428,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2599,10 +2728,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. 
*/ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2817,8 +2945,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2858,8 +2987,9 @@ static int ext4_dax_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); @@ -2914,9 +3044,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; @@ -2929,8 +3061,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - foliop, fsdata); + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) @@ -2938,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3031,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -3290,6 +3426,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else @@ -3329,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. 
We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. 
+ */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3342,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3373,7 +3673,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3387,6 +3691,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. 
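The mapping helpers and the force-commit above service untorn writes submitted from userspace. As a hypothetical, minimal sketch of that I/O path (the file path and the 16KiB size are assumptions, and RWF_ATOMIC/STATX_WRITE_ATOMIC require kernel headers and glibc new enough to expose them), an application would query the advertised atomic write unit with statx() and then issue a direct I/O write with pwritev2(..., RWF_ATOMIC):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/uio.h>

int main(void)
{
	struct statx stx;
	struct iovec iov = { .iov_len = 16384 };
	int fd = open("/mnt/scratch/file", O_RDWR | O_DIRECT);	/* assumed path */

	if (fd < 0 || statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &stx))
		return 1;
	/* stay within the untorn write unit the filesystem advertises */
	if (stx.stx_atomic_write_unit_max < iov.iov_len)
		return 1;
	/* O_DIRECT buffer, aligned to the write size */
	if (posix_memalign(&iov.iov_base, iov.iov_len, iov.iov_len))
		return 1;
	memset(iov.iov_base, 0xab, iov.iov_len);
	/* either all 16KiB reach the disk or none of it does */
	return pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) == (ssize_t)iov.iov_len ? 0 : 1;
}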
+ */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3397,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3410,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3420,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3438,6 +3772,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3679,9 +4023,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3696,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3902,6 +4245,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. 
*/ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3911,24 +4316,10 @@ static void ext4_wait_dax_page(struct inode *inode) int ext4_break_layouts(struct inode *inode) { - struct page *page; - int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; - do { - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - error = ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, - TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(inode)); - } while (error == 0); - - return error; + return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* @@ -3946,91 +4337,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); /* - * Write out all dirty pages to avoid race conditions - * Then release them. + * For indirect-block based inodes, make sure that the hole within + * one block before last range. 
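A worked example makes the sub-page handling in ext4_truncate_page_cache_block_range() above concrete (assuming a 1KiB block size, 4KiB pages, a file larger than 10KiB, and data not journalled): punching bytes [1536, 10240) is not page aligned, so page_boundary rounds up to 4096 and ext4_truncate_folio() is called for [1536, 4096) and, since end exceeds that boundary, again for [8192, 10240). In both calls round_up(start, 1024) is below round_down(end, 1024), meaning at least one whole block inside the partial folio will be freed, so the folio is write-protected via folio_mkclean() (and re-dirtied if it had writable mappings) before truncate_pagecache_range() drops the cache for [1536, 10239]; a later write access to the remaining partial blocks then faults through ext4_page_mkwrite() as intended.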
*/ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; + if (offset >= inode->i_size || offset >= max_end) + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - ret = file_modified(file); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. 
- */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -4040,54 +4396,53 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { - ext4_lblk_t hole_len = stop_block - first_block; + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, hole_len); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } - ext4_es_insert_extent(inode, first_block, hole_len, ~0, + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4209,8 +4564,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4674,6 +5031,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = 
xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4705,22 +5067,60 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + return 0; + +error: + ext4_error_inode(inode, function, line, 0, "%s", err_str); + return -EFSCORRUPTED; +} + +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4732,7 +5132,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4741,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4761,10 +5155,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return 
ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -4800,15 +5194,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4876,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4887,7 +5281,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -5007,8 +5402,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { @@ -5037,13 +5440,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; - } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. 
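For context on that -ESTALE path, the stale handle typically originates from userspace code along the lines of the hypothetical sketch below (paths are placeholders, and open_by_handle_at() needs CAP_DAC_READ_SEARCH): if the original file is unlinked and its inode number is later reused for an extended-attribute (EA) inode, reopening the saved handle now fails cleanly with ESTALE instead of tripping a corruption report.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, mount_fd, fd;

	if (argc < 3 || !fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0))
		return 1;
	/* ... the file is unlinked and its inode number gets reused ... */
	mount_fd = open(argv[2], O_RDONLY | O_DIRECTORY);
	if (mount_fd < 0)
		return 1;
	fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
	if (fd < 0 && errno == ESTALE)
		fprintf(stderr, "handle went stale, not a corruption\n");
	return 0;
}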
+ */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); + } + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; @@ -5228,8 +5642,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5351,8 +5766,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5464,7 +5880,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) - goto err_out; + goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); @@ -5505,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5518,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5633,7 +6050,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, awu_max = sbi->s_awu_max; } - generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; @@ -5714,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5723,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and lead blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? 
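As a quick sanity check on the reworked ext4_meta_trans_blocks() credit formula just below, here is a minimal standalone sketch; the idxblocks value of 3 and the EXT4_META_TRANS_BLOCKS() overhead of 10 are illustrative assumptions only, the point being that the group count is now bounded by idxblocks rather than idxblocks + pextents.

/* Illustrative credit arithmetic only; the constants are assumptions. */
int example_meta_trans_blocks(void)
{
	int idxblocks = 3;	/* assumed index/leaf blocks to modify */
	int groups = idxblocks;	/* was idxblocks + pextents before this change */
	int gdpblocks = groups;	/* one group descriptor block per touched group */
	int meta = 10;		/* assumed EXT4_META_TRANS_BLOCKS() overhead */

	/* index/leaf blocks + block bitmaps + descriptors + fixed overhead */
	return idxblocks + groups + gdpblocks + meta;	/* 3 + 3 + 3 + 10 = 19 */
}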
*/ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5742,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5762,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); @@ -5796,9 +6210,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5822,8 +6237,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { @@ -5834,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7b9ce71c1c81..5668a17458ae 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -142,16 +142,16 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); - if (ext4_has_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + if (ext4_has_feature_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } func(es, arg); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -351,11 +351,11 @@ void ext4_reset_inode_seed(struct inode *inode) __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* @@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp, * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. 
*/ - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; @@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(idmap, inode)) return -EPERM; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -1504,8 +1505,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; @@ -1705,7 +1712,7 @@ int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index bb2a223b207c..d634c12f1984 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); @@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b25a27c86696..1e98c5be4e0a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -187,7 +187,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * /sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -209,7 +209,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. 
For * non rotational devices, this value defaults to 0 and for rotational devices @@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; - } sg; + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) @@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); @@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ - memcpy(&sg, grinfo, i); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); + sg->bb_counters[i] : 0); seq_puts(seq, " ]"); - if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; @@ -5653,7 +5651,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5687,7 +5685,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" @@ -6644,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d64c04ed061a..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() 
&& !ext4_forced_shutdown(sb)) { + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 898443e98efc..1f8493a56e8f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; + int from; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -323,11 +323,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -360,6 +355,8 @@ again: goto unlock_folios; } data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; @@ -390,7 +387,7 @@ data_copy: if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < data_offset_in_page; i++) + for (i = 0; i < from >> orig_inode->i_blkbits; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); @@ -399,7 +396,7 @@ data_copy: bh = bh->b_this_page; } - block_commit_write(&folio[0]->page, from, from + replaced_size); + block_commit_write(folio[0], from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 536d56d15072..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, brelse(bh); return ERR_PTR(-EFSCORRUPTED); } - if (!ext4_has_metadata_csum(inode->i_sb) || + if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -291,36 +291,6 @@ struct dx_tail { __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(struct ext4_filename *fname, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct buffer_head *bh, - struct dx_hash_info *hinfo, - struct 
dx_map_entry *map_tail); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, - char *to, struct dx_map_entry *offsets, - int count, unsigned int blocksize); -static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, - unsigned int blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); @@ -376,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -398,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); @@ -419,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); @@ -472,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -480,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -494,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -523,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -612,7 +580,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -622,7 +590,7 @@ 
static inline unsigned dx_node_limit(struct inode *dir) unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1076,7 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; - int csum = ext4_has_metadata_csum(dir->i_sb); + int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1320,7 +1288,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { @@ -1462,7 +1430,8 @@ static bool ext4_match(struct inode *parent, * sure cf_name was properly initialized before * considering the calculated hash. */ - if (IS_ENCRYPTED(parent) && fname->cf_name.name && + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; @@ -1595,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else goto cleanup_and_exit; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); @@ -1945,7 +1919,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); @@ -1995,7 +1969,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. 
*/ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; @@ -2060,8 +2034,7 @@ out: return ERR_PTR(err); } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) @@ -2143,11 +2116,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, int csum_size = 0; int err, err2; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; @@ -2252,7 +2225,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct fake_dirent *fde; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -2396,7 +2369,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -2427,7 +2400,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; @@ -2577,8 +2550,10 @@ again: BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); @@ -2589,8 +2564,10 @@ again: err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); @@ -2609,8 +2586,10 @@ again: dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); @@ -2635,8 +2614,10 @@ again: "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; @@ -2733,7 +2714,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2973,7 +2954,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3004,19 +2985,19 @@ 
out: return err; } -static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -3066,7 +3047,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return ERR_PTR(err); } /* @@ -3151,8 +3132,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -3309,8 +3291,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; trace_ext4_unlink_enter(dir, dentry); /* @@ -3376,8 +3359,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + err = ext4_emergency_state(dir->i_sb); + if (unlikely(err)) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -4199,8 +4183,9 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) - return -EIO; + err = ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -537,13 +537,13 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/page-io.c 
b/fs/ext4/page-io.c index 69b8a7221a2b..179e54f3a3b6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. 
*/ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); + if (io_end->flag & EXT4_IO_END_FAILED || + (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec))) { + ext4_add_complete_io(io_end); return; } - ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. We have to do it * atomically as bio completions can be racing against each @@ -522,7 +549,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5d3a9dc9a32d..f329daf6e5c7 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode, int length; unsigned relative_block = 0; struct ext4_map_blocks map; - unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? 
readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; @@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode, map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. */ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; @@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode, /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { @@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = first_block + blocks_per_page - 1; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 72f77f78ae8d..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1118,8 +1118,8 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); } /* @@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a50e5c31b937..a7f80ca01174 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static void 
ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, @@ -287,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -302,20 +299,20 @@ __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); + return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -448,9 +445,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ -#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. @@ -458,8 +452,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: - * 1. More than an hour has passed since the last superblock update, and - * 2. More than 16MB have been written since the last superblock update. + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. 
* * @sb: The superblock */ @@ -473,14 +469,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb) __u64 lifetime_write_kbytes; __u64 diff_size; - if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || - !journal || (journal->j_flags & JBD2_UNMOUNT)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); - if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + @@ -495,49 +492,23 @@ static void ext4_maybe_update_superblock(struct super_block *sb) */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); - if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); - - spin_lock(&sbi->s_md_lock); - while (!list_empty(&txn->t_private_list)) { - jce = list_entry(txn->t_private_list.next, - struct ext4_journal_cb_entry, jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) - */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -558,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -578,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; + + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. 
+ */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } - return write_cache_pages(mapping, &wbc, - ext4_journalled_writepage_callback, - jinode->i_transaction); + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -707,11 +688,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!continue_fs && !sb_rdonly(sb)) { - set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + if (!continue_fs && !ext4_emergency_ro(sb) && journal) + jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -719,9 +697,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); @@ -737,17 +719,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_id); } - if (sb_rdonly(sb) || continue_fs) + if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem - * modifications. We don't set SB_RDONLY because that requires - * sb->s_umount semaphore and setting it without proper remount - * procedure is confusing code such as freeze_super() leading to - * deadlocks and other problems. + * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without proper remount procedure is + * confusing code such as freeze_super() leading to deadlocks + * and other problems. */ + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) @@ -765,7 +747,8 @@ static void update_super_work(struct work_struct *work) * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. 
*/ - if (!sb_rdonly(sbi->s_sb) && journal) { + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; @@ -819,7 +802,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -844,7 +827,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -879,7 +862,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -959,7 +942,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -1053,7 +1036,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -1306,18 +1289,17 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); @@ -1325,13 +1307,14 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!sb_rdonly(sb) && !aborted) { - ext4_clear_feature_journal_needs_recovery(sb); - ext4_clear_feature_orphan_present(sb); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (!sb_rdonly(sb)) + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } ext4_commit_super(sb); + } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); @@ -1426,10 +1409,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1823,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -2785,6 +2766,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, 
EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -3038,6 +3026,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + if (ext4_emergency_ro(sb)) + SEQ_OPTS_PUTS("emergency_ro"); + + if (ext4_forced_shutdown(sb)) + SEQ_OPTS_PUTS("shutdown"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3205,19 +3199,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sbi->s_sb)) { + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -3693,7 +3687,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; @@ -3998,7 +3993,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (sb_rdonly(sb) || + if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; @@ -4061,7 +4056,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -4349,7 +4344,7 @@ static void ext4_set_def_opts(struct super_block *sb, if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) @@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb) /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
* @sb: super block - * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; @@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb) sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); - sbi->s_awu_max = min(sb->s_blocksize, + sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { @@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; @@ -4642,8 +4640,9 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } @@ -4973,10 +4972,7 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_sb_upd_work before destroying the journal. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } @@ -5013,6 +5009,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb) return 0; } +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5239,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -5263,6 +5277,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is @@ -5404,30 +5420,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "suppressed and 
not mounted read-only"); goto failed_mount3a; } else { + const char *journal_option; + /* Nojournal mode, all journal mount options are illegal */ - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); goto failed_mount3a; } - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "commit=%lu, fs mounted w/o journal", - sbi->s_commit_interval / HZ); - goto failed_mount3a; - } - if (EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "data=, fs mounted w/o journal"); - goto failed_mount3a; - } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5616,9 +5619,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount9; } - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5665,10 +5670,7 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_sb_upd_work before journal destroy. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5676,7 +5678,7 @@ failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); - del_timer_sync(&sbi->s_err_report); + timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) @@ -5773,10 +5775,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. 
@@ -5916,7 +5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; @@ -5973,7 +5971,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); @@ -6090,8 +6088,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6109,7 +6106,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6336,8 +6333,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6419,7 +6417,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } @@ -6575,7 +6573,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_forced_shutdown(sb)) { + if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } @@ -6780,6 +6778,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6791,9 +6790,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6817,22 +6816,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? 
- (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6935,12 +6941,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6951,6 +6970,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7288,7 +7311,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); @@ -7381,12 +7404,9 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - static int __init ext4_init_fs(void) { - int i, err; + int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; @@ -7394,9 +7414,6 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) - init_waitqueue_head(&ext4__ioend_wq[i]); - err = ext4_init_es(); if (err) return err; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ddb54608ca2e..987bd00f916a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7647e9f6e190..8d15acbacc20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ 
-139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); @@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } @@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static inline int +int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { @@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, function, line); } -#define xattr_check_inode(inode, header, end) \ - __xattr_check_inode((inode), (header), (end), __func__, __LINE__) - static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) @@ -351,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) @@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; + end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) @@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; - void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) @@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; error = 
ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); -cleanup: brelse(iloc.bh); return error; } @@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; - void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); @@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - ret = xattr_check_inode(inode, header, end); - if (ret) - goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; @@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = xattr_check_inode(inode, header, is->s.end); - if (error) - return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); @@ -2786,14 +2775,10 @@ retry: */ base = IFIRST(header); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + end = ITAIL(inode, raw_inode); min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; - ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b25c2d7b5f99..1fedf44d4fb6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -67,6 +67,9 @@ struct ext4_xattr_entry { ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) +#define ITAIL(inode, raw_inode) \ + ((void *)(raw_inode) + \ + EXT4_SB((inode)->i_sb)->s_inode_size) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* @@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line); + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); |