summaryrefslogtreecommitdiff
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c883
1 files changed, 650 insertions, 233 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4..be9a4cba35fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -57,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -93,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -114,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -141,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
@@ -181,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ dax_break_layout_final(inode);
+
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
@@ -383,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -412,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -458,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
+
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map)
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
+ unsigned int orig_mlen = map->m_len;
+ flags &= EXT4_EX_QUERY_FILTER;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval <= 0)
return retval;
@@ -480,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
WARN_ON(1);
}
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
- return retval;
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return retval;
+ }
+
+ return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -598,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -618,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers from the context of data submission are the only exceptions
+ * for regular files that do not hold the i_rwsem or invalidate_lock.
+ * However, caching unrelated ranges is not permitted.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
@@ -649,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -663,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- retval = ext4_map_query_blocks(handle, inode, map);
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -692,6 +801,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
+
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -751,7 +862,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@@ -1005,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
@@ -1023,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
@@ -1037,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_buffers(folio);
@@ -1148,9 +1263,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
unsigned from, to;
+ fgf_t fgp = FGP_WRITEBEGIN;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_write_begin(inode, pos, len);
/*
@@ -1159,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
@@ -1179,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -1760,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
map->m_len = min_t(unsigned int, map->m_len,
@@ -1800,7 +1925,7 @@ found:
if (ext4_has_inline_data(inode))
retval = 0;
else
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval < 0 ? retval : 0;
@@ -1823,7 +1948,7 @@ add_delayed:
goto found;
}
} else if (!ext4_has_inline_data(inode)) {
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
return retval < 0 ? retval : 0;
@@ -1931,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
return err;
}
@@ -2154,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2165,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
+ lblk = folio->index << bpp_bits;
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2207,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
+ * possible. In addition, do not cache any unrelated extents, as it
+ * only holds the folio lock but does not hold the i_rwsem or
+ * invalidate_lock, which could corrupt the extent status tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2225,7 +2354,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@@ -2273,7 +2402,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2350,7 +2479,7 @@ update_disksize:
*/
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
@@ -2386,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
size_t len = folio_size(folio);
folio_clear_checked(folio);
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
@@ -2428,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
- int bpp = ext4_journal_blocks_per_page(mpd->inode);
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2599,10 +2728,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
- ret = -EROFS;
+ ret = ext4_emergency_state(mapping->host->i_sb);
+ if (unlikely(ret))
goto out_writepages;
- }
/*
* If we have inline data and arrive here, it means that
@@ -2817,8 +2945,9 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
@@ -2858,8 +2987,9 @@ static int ext4_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
@@ -2914,9 +3044,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
struct inode *inode = mapping->host;
+ fgf_t fgp = FGP_WRITEBEGIN;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
index = pos >> PAGE_SHIFT;
@@ -2929,8 +3061,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- foliop, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
@@ -2938,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
if (ret < 0) {
@@ -3031,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
unsigned long end;
i_size_write(inode, new_i_size);
- end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ end = offset_in_folio(folio, new_i_size - 1);
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
@@ -3290,6 +3426,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
@@ -3329,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped mapping. This will ensure any unwritten or hole
+ * regions within the requested range is zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3342,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * journal credits estimation for atomic writes. We call
+ * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+ * then let's assume the no. of pextents required can be m_len i.e.
+ * every alternate block can be unwritten and hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3373,7 +3673,11 @@ retry:
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3387,6 +3691,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3397,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3410,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
if (flags & IOMAP_WRITE) {
/*
@@ -3420,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
- goto out;
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
+ /*
+ * This can be called for overwrites path from
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3438,6 +3772,16 @@ out:
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3679,9 +4023,7 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3696,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
if (!bh)
bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -3902,6 +4245,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsitent data on disk
+ * in case of crash before freeing or unwritten converting trans
+ * is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@@ -3911,24 +4316,10 @@ static void ext4_wait_dax_page(struct inode *inode)
int ext4_break_layouts(struct inode *inode)
{
- struct page *page;
- int error;
-
if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
- do {
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
- } while (error == 0);
-
- return error;
+ return dax_break_layout_inode(inode, ext4_wait_dax_page);
}
/*
@@ -3946,91 +4337,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = sb->s_maxbytes;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
*/
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
+ if (offset >= inode->i_size || offset >= max_end)
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
- ret = file_modified(file);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -4040,54 +4396,53 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
- ext4_lblk_t hole_len = stop_block - first_block;
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block, hole_len);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
- ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -4209,8 +4564,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4674,6 +5031,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
int err;
+ err = xattr_check_inode(inode, IHDR(inode, raw_inode),
+ ITAIL(inode, raw_inode));
+ if (err)
+ return err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
err = ext4_find_inline_data_nolock(inode);
if (!err && ext4_has_inline_data(inode))
@@ -4705,22 +5067,60 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
+ return -EFSCORRUPTED;
+}
+
+bool ext4_should_enable_large_folio(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_encrypt(sb))
+ return false;
+
+ return true;
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4732,7 +5132,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4741,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
- ino == le32_to_cpu(es->s_usr_quota_inum) ||
- ino == le32_to_cpu(es->s_grp_quota_inum) ||
- ino == le32_to_cpu(es->s_prj_quota_inum) ||
- ino == le32_to_cpu(es->s_orphan_file_inum))) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
@@ -4761,10 +5155,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -4800,15 +5194,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
@@ -4876,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4887,7 +5281,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
- if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ if (!ext4_has_feature_dir_index(sb) &&
+ ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@@ -5007,8 +5402,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else {
@@ -5037,13 +5440,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
+ if (ext4_should_enable_large_folio(inode))
+ mapping_set_large_folios(inode->i_mapping);
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
@@ -5228,8 +5642,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
@@ -5351,8 +5766,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ error = ext4_emergency_state(inode->i_sb);
+ if (unlikely(error))
+ return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -5464,7 +5880,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
oldsize & (inode->i_sb->s_blocksize - 1)) {
error = ext4_inode_attach_jinode(inode);
if (error)
- goto err_out;
+ goto out_mmap_sem;
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
@@ -5505,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
down_write(&EXT4_I(inode)->i_data_sem);
old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5518,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
else
EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5633,7 +6050,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
- generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
@@ -5714,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5723,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and lead blocks need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks + pextents;
+ groups = idxblocks;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -5742,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5762,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
int ext4_writepage_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
@@ -5796,9 +6210,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err)) {
put_bh(iloc->bh);
- return -EIO;
+ return err;
}
ext4_fc_track_inode(handle, inode);
@@ -5822,8 +6237,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
@@ -5834,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;