From 2eeb0dce728a7eac3e4dfe355d98af40d61f7a26 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Jul 2021 10:30:58 -0700 Subject: f2fs: don't sleep while grabing nat_tree_lock This tries to fix priority inversion in the below condition resulting in long checkpoint delay. f2fs_get_node_info() - nat_tree_lock -> sleep to grab journal_rwsem by contention checkpoint - waiting for nat_tree_lock In order to let checkpoint go, let's release nat_tree_lock, if there's a journal_rwsem contention. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0be9e2d7120e..c945a9730f3c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, int i; ni->nid = nid; - +retry: /* Check nat cache */ down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); @@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, return 0; } - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. + */ + if (!rwsem_is_locked(&sbi->cp_global_sem)) { + down_read(&curseg->journal_rwsem); + } else if (!down_read_trylock(&curseg->journal_rwsem)) { + up_read(&nm_i->nat_tree_lock); + goto retry; + } - /* Check current segment summary */ - down_read(&curseg->journal_rwsem); i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); -- cgit From b7ec2061737f12c33e45beeb967d17f31abc1ada Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Jul 2021 09:12:15 -0700 Subject: f2fs: do not submit NEW_ADDR to read node block After the below patch, give cp is errored, we drop dirty node pages. This can give NEW_ADDR to read node pages. Don't do WARN_ON() which gives generic/475 failure. Fixes: 28607bf3aa6f ("f2fs: drop dirty node pages when cp is in error status") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c945a9730f3c..5840b82ce311 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1330,7 +1330,8 @@ static int read_node_page(struct page *page, int op_flags) if (err) return err; - if (unlikely(ni.blk_addr == NULL_ADDR) || + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; -- cgit From 94afd6d6e5253179c9b891d02081cc8355a11768 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Aug 2021 10:23:48 +0800 Subject: f2fs: extent cache: support unaligned extent Compressed inode may suffer read performance issue due to it can not use extent cache, so I propose to add this unaligned extent support to improve it. Currently, it only works in readonly format f2fs image. Unaligned extent: in one compressed cluster, physical block number will be less than logical block number, so we add an extra physical block length in extent info in order to indicate such extent status. The idea is if one whole cluster blocks are contiguous physically, once its mapping info was readed at first time, we will cache an unaligned (or aligned) extent info entry in extent cache, it expects that the mapping info will be hitted when rereading cluster. Merge policy: - Aligned extents can be merged. - Aligned extent and unaligned extent can not be merged. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5840b82ce311..9d838a7929fb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -841,6 +841,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: return 0; release_pages: -- cgit From 324105775c1982aacbd2972b7024d8cc06474abe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Aug 2021 08:24:48 +0800 Subject: f2fs: support fault injection for f2fs_kmem_cache_alloc() This patch supports to inject fault into f2fs_kmem_cache_alloc(). Usage: a) echo 32768 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=32768 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d838a7929fb..161173de5a2d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -162,14 +162,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) { struct nat_entry *new; - if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); - else - new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -242,7 +241,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -329,7 +329,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, unsigned long flags; unsigned int seq_id; - fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); get_page(page); fn->page = page; @@ -428,7 +429,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; - new = __alloc_nat_entry(nid, false); + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -451,7 +452,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); @@ -2252,7 +2253,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(f2fs_check_nid_range(sbi, nid))) return false; - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; i->state = FREE_NID; @@ -2842,7 +2843,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = __alloc_nat_entry(nid, true); + ne = __alloc_nat_entry(sbi, nid, true); __init_nat_entry(nm_i, ne, &raw_ne, true); } -- cgit From 521187439abfb3e1c946796dc2187c443e5457ab Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 19 Aug 2021 20:52:28 -0700 Subject: f2fs: separate out iostat feature Added F2FS_IOSTAT config option to support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events and moved I/O statistics related codes into separate files for better maintenance. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu [Jaegeuk Kim: set default=y] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 161173de5a2d..043cb831b289 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,6 +17,7 @@ #include "node.h" #include "segment.h" #include "xattr.h" +#include "iostat.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) -- cgit From 94c821fb286b545d37549ff30a0c341e066f0d6c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 20 Aug 2021 18:54:59 +0800 Subject: f2fs: rebuild nat_bits during umount If all free_nat_bitmap are available, we can rebuild nat_bits from free_nat_bitmap entirely during umount, let's make another chance to reenable nat_bits for image. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 101 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 23 deletions(-) (limited to 'fs/f2fs/node.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 043cb831b289..e863136081b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2213,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + up_read(&nm_i->nat_tree_lock); + + return ret; +} + static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2884,7 +2902,23 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2893,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; if (nat_index == 0) { @@ -2904,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - if (valid == 0) { - __set_bit_le(nat_index, nm_i->empty_nat_bits); - __clear_bit_le(nat_index, nm_i->full_nat_bits); - return; + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + down_read(&nm_i->nat_tree_lock); + + for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); } - __clear_bit_le(nat_index, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_index, nm_i->full_nat_bits); - else - __clear_bit_le(nat_index, nm_i->full_nat_bits); + up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -2933,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (enabled_nat_bits(sbi, cpc) || + if ((cpc->reason & CP_UMOUNT) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -2980,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); + update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3011,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (enabled_nat_bits(sbi, cpc)) { + if (cpc->reason & CP_UMOUNT) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); @@ -3027,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (enabled_nat_bits(sbi, cpc) || + if (cpc->reason & CP_UMOUNT || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3064,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; - if (!enabled_nat_bits(sbi, NULL)) - return 0; - nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3089,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - disable_nat_bits(sbi, true); + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); return 0; } - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3106,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; for (i = 0; i < nm_i->nat_blocks; i++) { -- cgit