-rw-r--r--   Documentation/ABI/testing/sysfs-fs-f2fs |  56
-rw-r--r--   Documentation/filesystems/f2fs.rst      | 122
-rw-r--r--   fs/f2fs/checkpoint.c                    |  53
-rw-r--r--   fs/f2fs/compress.c                      |  43
-rw-r--r--   fs/f2fs/data.c                          |  59
-rw-r--r--   fs/f2fs/dir.c                           |  17
-rw-r--r--   fs/f2fs/extent_cache.c                  |  15
-rw-r--r--   fs/f2fs/f2fs.h                          |  88
-rw-r--r--   fs/f2fs/file.c                          |  49
-rw-r--r--   fs/f2fs/gc.c                            |  25
-rw-r--r--   fs/f2fs/node.c                          |  77
-rw-r--r--   fs/f2fs/node.h                          |   1
-rw-r--r--   fs/f2fs/recovery.c                      |   2
-rw-r--r--   fs/f2fs/segment.c                       |  30
-rw-r--r--   fs/f2fs/segment.h                       |  28
-rw-r--r--   fs/f2fs/super.c                         | 121
-rw-r--r--   fs/f2fs/sysfs.c                         | 119
-rw-r--r--   include/linux/f2fs_fs.h                 |   1
18 files changed, 702 insertions, 204 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index bc0e7fefc39d..b590809869ca 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -822,8 +822,8 @@ What:		/sys/fs/f2fs/<disk>/gc_valid_thresh_ratio
 Date:		September 2024
 Contact:	"Daeho Jeong" <daehojeong@google.com>
 Description:	It controls the valid block ratio threshold not to trigger excessive GC
-		for zoned deivces. The initial value of it is 95(%). F2FS will stop the
-		background GC thread from intiating GC for sections having valid blocks
+		for zoned devices. The initial value of it is 95(%). F2FS will stop the
+		background GC thread from initiating GC for sections having valid blocks
 		exceeding the ratio.
 
 What:		/sys/fs/f2fs/<disk>/max_read_extent_count
@@ -847,7 +847,7 @@ Description:	For several zoned storage devices, vendors will provide extra space
 		filesystem level GC. To do that, we can reserve the space using
 		reserved_blocks. However, it is not enough, since this extra space should
 		not be shown to users. So, with this new sysfs node, we can hide the space
-		by substracting reserved_blocks from total bytes.
+		by subtracting reserved_blocks from total bytes.
 
 What:		/sys/fs/f2fs/<disk>/encoding_flags
 Date:		April 2025
@@ -883,3 +883,53 @@ Date:		June 2025
 Contact:	"Daeho Jeong" <daehojeong@google.com>
 Description:	Control GC algorithm for boost GC. 0: cost benefit, 1: greedy
 		Default: 1
+
+What:		/sys/fs/f2fs/<disk>/effective_lookup_mode
+Date:		August 2025
+Contact:	"Daniel Lee" <chullee@google.com>
+Description:
+		This is a read-only entry to show the effective directory lookup mode
+		F2FS is currently using for casefolded directories.
+		This considers both the "lookup_mode" mount option and the on-disk
+		encoding flag, SB_ENC_NO_COMPAT_FALLBACK_FL.
+
+		Possible values are:
+		- "perf": Hash-only lookup.
+		- "compat": Hash-based lookup with a linear search fallback enabled
+		- "auto:perf": lookup_mode is auto and fallback is disabled on-disk
+		- "auto:compat": lookup_mode is auto and fallback is enabled on-disk
+
+What:		/sys/fs/f2fs/<disk>/bggc_io_aware
+Date:		August 2025
+Contact:	"Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Used to adjust the BG_GC priority when there is pending IO, with a
+		default value of 0. Specifically, for ZUFS, the default value is 1.
+
+		================== ======================================================
+		value              description
+		================== ======================================================
+		bggc_io_aware = 0  skip background GC if there is any kind of pending IO
+		bggc_io_aware = 1  skip background GC if there is pending read IO
+		bggc_io_aware = 2  do not consider pending IO for background GC
+		================== ======================================================
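The two entries just documented (effective_lookup_mode and bggc_io_aware) both boil down to small decision functions in this series. As orientation, here is a self-contained sketch of both decisions with the kernel helpers reduced to plain booleans; the function and parameter names are illustrative, while the logic mirrors the fs/f2fs/dir.c and fs/f2fs/f2fs.h hunks further below:

#include <stdbool.h>

enum f2fs_lookup_mode { LOOKUP_PERF, LOOKUP_COMPAT, LOOKUP_AUTO };
enum bggc_io_aware_policy { AWARE_ALL_IO, AWARE_READ_IO, AWARE_NONE };

/* Mirrors f2fs_should_fallback_to_linear(): decides whether a missed
 * hash lookup in a casefolded directory retries with a linear scan. */
static bool should_fallback_to_linear(enum f2fs_lookup_mode mode,
				      bool no_compat_fallback_on_disk)
{
	switch (mode) {
	case LOOKUP_PERF:	/* hash-only; on-disk flag ignored */
		return false;
	case LOOKUP_COMPAT:	/* always fall back, for old-kernel entries */
		return true;
	case LOOKUP_AUTO:	/* defer to SB_ENC_NO_COMPAT_FALLBACK_FL */
		return !no_compat_fallback_on_disk;
	}
	return false;
}

/* Mirrors the is_idle() rewrite: which pending IO blocks background GC. */
static bool bggc_allowed(enum bggc_io_aware_policy aware,
			 bool inflight_read_io, bool inflight_any_io)
{
	if (aware == AWARE_READ_IO && inflight_read_io)
		return false;	/* zoned default: only reads block BG_GC */
	if (aware == AWARE_ALL_IO && inflight_any_io)
		return false;	/* general default: any pending IO blocks BG_GC */
	return true;		/* AWARE_NONE ignores pending IO */
}

Note how AWARE_READ_IO, the zoned default, lets background GC proceed under write-only load, which is the point of the knob.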
+
+What:		/sys/fs/f2fs/<disk>/allocate_section_hint
+Date:		August 2025
+Contact:	"Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Indicates the hint section between the first device and the others
+		in a multi-device setup. It defaults to the end of the first device
+		in sections. For a single storage device, it defaults to the total
+		number of sections. It can be manually set to match scenarios where
+		multiple devices are mapped to the same dm device.
+
+What:		/sys/fs/f2fs/<disk>/allocate_section_policy
+Date:		August 2025
+Contact:	"Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Controls write priority in multi-device setups. A value of 0 means
+		normal writing. A value of 1 prioritizes writing to devices before
+		the allocate_section_hint. A value of 2 prioritizes writing to
+		devices after the allocate_section_hint. The default is 0.
+
+		=========================== ==========================================================
+		value                       description
+		=========================== ==========================================================
+		allocate_section_policy = 0 Normal writing
+		allocate_section_policy = 1 Prioritize writing to sections before allocate_section_hint
+		allocate_section_policy = 2 Prioritize writing to sections after allocate_section_hint
+		=========================== ==========================================================
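Before moving on to the mount-option documentation, a compact sketch of how these two knobs steer the free-section search; bias_section_hint() is a hypothetical stand-in for the logic the fs/f2fs/segment.c hunk below adds to get_new_segment():

enum device_allocation_policy {
	ALLOCATE_FORWARD_NOHINT,
	ALLOCATE_FORWARD_WITHIN_HINT,
	ALLOCATE_FORWARD_FROM_HINT,
};

/* Bias the starting section of the free-section search, the way
 * get_new_segment() applies allocate_section_policy below. */
static unsigned int bias_section_hint(unsigned int hint,
				      enum device_allocation_policy policy,
				      unsigned int boundary,	/* allocate_section_hint */
				      unsigned int main_secs)	/* MAIN_SECS() */
{
	/* clamp a boundary that raced past MAIN_SECS(), e.g. after resize */
	if (policy != ALLOCATE_FORWARD_NOHINT && boundary > main_secs)
		boundary = main_secs;

	if (policy == ALLOCATE_FORWARD_FROM_HINT && hint < boundary)
		hint = boundary;	/* only sections at/after the boundary */
	else if (policy == ALLOCATE_FORWARD_WITHIN_HINT && hint >= boundary)
		hint = 0;		/* wrap back before the boundary */

	return hint;
}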
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index e5bb89452aff..a8d02fe5be83 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -1,8 +1,11 @@
 .. SPDX-License-Identifier: GPL-2.0
 
-==========================================
-WHAT IS Flash-Friendly File System (F2FS)?
-==========================================
+=================================
+Flash-Friendly File System (F2FS)
+=================================
+
+Overview
+========
 
 NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
 been equipped on a variety systems ranging from mobile to server systems. Since
@@ -173,9 +176,12 @@ data_flush		 Enable data flushing before checkpoint in order to
 			 persist data of regular and symlink.
 reserve_root=%d	 Support configuring reserved space which is used for
 			 allocation from a privileged user with specified uid or
-			 gid, unit: 4KB, the default limit is 0.2% of user blocks.
-resuid=%d		 The user ID which may use the reserved blocks.
-resgid=%d		 The group ID which may use the reserved blocks.
+			 gid, unit: 4KB, the default limit is 12.5% of user blocks.
+reserve_node=%d	 Support configuring reserved nodes which are used for
+			 allocation from a privileged user with specified uid or
+			 gid, the default limit is 12.5% of all nodes.
+resuid=%d		 The user ID which may use the reserved blocks and nodes.
+resgid=%d		 The group ID which may use the reserved blocks and nodes.
 fault_injection=%d	 Enable fault injection in all supported types with
 			 specified injection rate.
 fault_type=%d		 Support configuring fault injection type, should be
@@ -291,9 +297,13 @@ compress_algorithm=%s	 Control compress algorithm, currently f2fs supports "lzo"
 			 "lz4", "zstd" and "lzo-rle" algorithm.
 compress_algorithm=%s:%d	 Control compress algorithm and its compress level, now, only
 			 "lz4" and "zstd" support compress level config.
+
+			 ========= ===========
 			 algorithm level range
+			 ========= ===========
 			 lz4       3 - 16
 			 zstd      1 - 22
+			 ========= ===========
 compress_log_size=%u	 Support configuring compress cluster size. The size will
 			 be 4KB * (1 << %u). The default and minimum sizes are 16KB.
 compress_extension=%s	 Support adding specified extension, so that f2fs can enable
@@ -357,6 +367,7 @@ errors=%s		 Specify f2fs behavior on critical errors. This supports modes:
 			 panic immediately, continue without doing anything, and remount
 			 the partition in read-only mode. By default it uses "continue"
 			 mode.
+
 			 ====================== =============== =============== ========
 			 mode                   continue        remount-ro      panic
 			 ====================== =============== =============== ========
@@ -370,6 +381,25 @@ errors=%s		 Specify f2fs behavior on critical errors. This supports modes:
 			 ====================== =============== =============== ========
 nat_bits		 Enable nat_bits feature to enhance full/empty nat
 			 blocks access, by default it's disabled.
+lookup_mode=%s		 Control the directory lookup behavior for casefolded
+			 directories. This option has no effect on directories
+			 that do not have the casefold feature enabled.
+
+			 ================== ========================================
+			 Value              Description
+			 ================== ========================================
+			 perf               (Default) Enforces a hash-only lookup.
+			                    The linear search fallback is always
+			                    disabled, ignoring the on-disk flag.
+			 compat             Enables the linear search fallback for
+			                    compatibility with directory entries
+			                    created by older kernels that used a
+			                    different case-folding algorithm.
+			                    This mode ignores the on-disk flag.
+			 auto               F2FS determines the mode based on the
+			                    on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL`
+			                    flag.
+			 ================== ========================================
 ======================== ============================================================
 
 Debugfs Entries
@@ -795,11 +825,13 @@ ioctl(COLD)		 COLD_DATA		 WRITE_LIFE_EXTREME
 extension list		 "			 "
 
 -- buffered io
+------------------------------------------------------------------
 N/A			 COLD_DATA		 WRITE_LIFE_EXTREME
 N/A			 HOT_DATA		 WRITE_LIFE_SHORT
 N/A			 WARM_DATA		 WRITE_LIFE_NOT_SET
 
 -- direct io
+------------------------------------------------------------------
 WRITE_LIFE_EXTREME	 COLD_DATA		 WRITE_LIFE_EXTREME
 WRITE_LIFE_SHORT	 HOT_DATA		 WRITE_LIFE_SHORT
 WRITE_LIFE_NOT_SET	 WARM_DATA		 WRITE_LIFE_NOT_SET
@@ -915,24 +947,26 @@ compression enabled files (refer to "Compression implementation" section for how
 enable compression on a regular inode).
 
 1) compress_mode=fs
-This is the default option. f2fs does automatic compression in the writeback of the
-compression enabled files.
+
+ This is the default option. f2fs does automatic compression in the writeback of the
+ compression enabled files.
 
 2) compress_mode=user
-This disables the automatic compression and gives the user discretion of choosing the
-target file and the timing. The user can do manual compression/decompression on the
-compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
-ioctls like the below.
-To decompress a file,
+ This disables the automatic compression and gives the user discretion of choosing the
+ target file and the timing. The user can do manual compression/decompression on the
+ compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
+ ioctls like the below.
+
+To decompress a file::
 
-fd = open(filename, O_WRONLY, 0);
-ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
+ fd = open(filename, O_WRONLY, 0);
+ ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
 
-To compress a file,
+To compress a file::
 
-fd = open(filename, O_WRONLY, 0);
-ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
+ fd = open(filename, O_WRONLY, 0);
+ ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
 
 NVMe Zoned Namespace devices
 ----------------------------
@@ -962,32 +996,32 @@ reserved and used by another filesystem or for different purposes. Once
 that external usage is complete, the device aliasing file can be deleted,
 releasing the reserved space back to F2FS for its own use.
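The use-case below drives the flag check with the f2fs_io tool; for reference, a minimal C sketch of the same check through the generic FS_IOC_GETFLAGS ioctl (flag macros from <linux/fs.h>; error handling trimmed) might look like this:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Print the pinned(nocow)/immutable state of a device aliasing file,
 * i.e. what `f2fs_io getflags` reports in the use-case below. */
int main(int argc, char **argv)
{
	long flags = 0;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	printf("immutable=%d nocow(pinned)=%d\n",
	       !!(flags & FS_IMMUTABLE_FL), !!(flags & FS_NOCOW_FL));
	return 0;
}

The flags=800010 value in the listing decodes to FS_NOCOW_FL (0x00800000) plus FS_IMMUTABLE_FL (0x00000010), matching the nocow(pinned),immutable output.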
-<use-case> - -# ls /dev/vd* -/dev/vdb (32GB) /dev/vdc (32GB) -# mkfs.ext4 /dev/vdc -# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb -# mount /dev/vdb /mnt/f2fs -# ls -l /mnt/f2fs -vdc.file -# df -h -/dev/vdb 64G 33G 32G 52% /mnt/f2fs - -# mount -o loop /dev/vdc /mnt/ext4 -# df -h -/dev/vdb 64G 33G 32G 52% /mnt/f2fs -/dev/loop7 32G 24K 30G 1% /mnt/ext4 -# umount /mnt/ext4 - -# f2fs_io getflags /mnt/f2fs/vdc.file -get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable -# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file -get a flag on noimmutable ret=0, flags=800010 -set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable -# rm /mnt/f2fs/vdc.file -# df -h -/dev/vdb 64G 753M 64G 2% /mnt/f2fs +.. code-block:: + + # ls /dev/vd* + /dev/vdb (32GB) /dev/vdc (32GB) + # mkfs.ext4 /dev/vdc + # mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb + # mount /dev/vdb /mnt/f2fs + # ls -l /mnt/f2fs + vdc.file + # df -h + /dev/vdb 64G 33G 32G 52% /mnt/f2fs + + # mount -o loop /dev/vdc /mnt/ext4 + # df -h + /dev/vdb 64G 33G 32G 52% /mnt/f2fs + /dev/loop7 32G 24K 30G 1% /mnt/ext4 + # umount /mnt/ext4 + + # f2fs_io getflags /mnt/f2fs/vdc.file + get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable + # f2fs_io setflags noimmutable /mnt/f2fs/vdc.file + get a flag on noimmutable ret=0, flags=800010 + set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable + # rm /mnt/f2fs/vdc.file + # df -h + /dev/vdb 64G 753M 64G 2% /mnt/f2fs So, the key idea is, user can do any file operations on /dev/vdc, and reclaim the space after the use, while the space is counted as /data. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index db3831f7f2f5..bbe07e3a6c75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) return get_sectors_written(sbi->sb->s_bdev); } +static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type) +{ + cpc->stats.times[type] = ktime_get(); +} + +static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long long sb_diff, cur_diff; + enum cp_time ct; + + sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END], + sbi->cp_stats.times[CP_TIME_START]); + cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END], + cpc->stats.times[CP_TIME_START]); + + if (cur_diff > sb_diff) { + sbi->cp_stats = cpc->stats; + if (cur_diff < CP_LONG_LATENCY_THRESHOLD) + return; + + f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff); + for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++) + f2fs_warn(sbi, "Step#%d: %llu ms", ct, + (u64)ktime_ms_delta(cpc->stats.times[ct + 1], + cpc->stats.times[ct])); + } +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_META); + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); @@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_CP_META); + /* Wait for all dirty meta pages to be submitted 
for IO */ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) return err; + stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE); /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP); /* * invalidate intermediate page cache borrowed from meta inode which are @@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + stat_cp_time(cpc, CP_TIME_START); + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return -EROFS; @@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason != CP_RESIZE) f2fs_down_write(&sbi->cp_global_sem); + stat_cp_time(cpc, CP_TIME_LOCK); + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) @@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (err) goto out; + stat_cp_time(cpc, CP_TIME_OP_LOCK); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_flush_merged_writes(sbi); @@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + stat_cp_time(cpc, CP_TIME_FLUSH_META); + /* save inmem log status */ f2fs_save_inmem_curseg(sbi); @@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) stat_inc_cp_count(sbi); stop: unblock_operations(sbi); + stat_cp_time(cpc, CP_TIME_END); + check_cp_time(sbi, cpc); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); @@ -1778,6 +1824,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) llist_for_each_entry_safe(req, next, dispatch_list, llnode) { diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); req->ret = ret; + req->delta_time = diff; complete(&req->wait); sum_diff += diff; @@ -1873,6 +1920,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) else flush_remained_ckpt_reqs(sbi, &req); + if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) { + f2fs_warn_ratelimited(sbi, + "blocked on checkpoint for %u ms", cprc->peak_time); + dump_stack(); + } + return req.ret; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5c1f47e45dab..6ad8d3bc6df7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; + struct page **rpages; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; + int i; int err; err = f2fs_is_compressed_cluster(inode, start_idx); @@ -1238,27 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) if (err <= 0) return err; - if (err > 0) { - struct page **rpages = fsdata; - int cluster_size = F2FS_I(inode)->i_cluster_size; - int i; - - for (i = cluster_size - 1; i >= 0; i--) { - struct folio *folio = page_folio(rpages[i]); - loff_t start = folio->index << PAGE_SHIFT; - - if (from <= start) 
{ - folio_zero_segment(folio, 0, folio_size(folio)); - } else { - folio_zero_segment(folio, from - start, - folio_size(folio)); - break; - } - } + rpages = fsdata; + + for (i = (1 << log_cluster_size) - 1; i >= 0; i--) { + struct folio *folio = page_folio(rpages[i]); + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - f2fs_compress_write_end(inode, fsdata, start_idx, true); + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) + break; } - return 0; + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, 1 << log_cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock); } static int f2fs_write_compressed_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7961e0ddfca3..ef38e62cda8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { + blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA); + if (io->fio.op != fio->op) return false; - return io->fio.op_flags == fio->op_flags; + return (io->fio.op_flags & mask) == (fio->op_flags & mask); } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, @@ -911,7 +913,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1083,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, } /* This can handle encryption stuffs */ -static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, +static void f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { @@ -1092,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); - if (IS_ERR(bio)) - return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); - if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { - iostat_update_and_unbind_ctx(bio); - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); - return -EFAULT; - } + if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) + f2fs_bug_on(sbi, 1); + inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); - return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) @@ -1265,10 +1260,8 @@ got_it: return folio; } - err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, + f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); - if (err) - goto put_err; return folio; put_err: @@ -1572,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; + if (flag == F2FS_GET_BLOCK_PRECACHE) + mode = LOOKUP_NODE_RA; + next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) @@ -1778,12 +1774,13 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned 
int ofs = start_pgofs - map->m_lblk; - f2fs_update_read_extent_cache_range(&dn, - start_pgofs, map->m_pblk + ofs, - map->m_len - ofs); + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); } if (map->m_next_extent) - *map->m_next_extent = pgofs + 1; + *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; } f2fs_put_dnode(&dn); unlock_out: @@ -2145,16 +2142,10 @@ submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } - if (bio == NULL) { + if (bio == NULL) bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - bio = NULL; - goto out; - } - } /* * If the page is under writeback, we need to wait for @@ -2303,18 +2294,10 @@ submit_and_realloc: bio = NULL; } - if (!bio) { + if (!bio) bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret, true); - f2fs_put_dnode(&dn); - *bio_ret = NULL; - return ret; - } - } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; @@ -3639,11 +3622,9 @@ repeat: err = -EFSCORRUPTED; goto put_folio; } - err = f2fs_submit_page_read(use_cow ? + f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); - if (err) - goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fffd7749d6d1..48f4f98afb01 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,21 @@ #include "xattr.h" #include <trace/events/f2fs.h> +static inline bool f2fs_should_fallback_to_linear(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return false; + case LOOKUP_COMPAT: + return true; + case LOOKUP_AUTO: + return !sb_no_casefold_compat_fallback(sbi->sb); + } + return false; +} + #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -366,7 +381,7 @@ start_find_entry: out: #if IS_ENABLED(CONFIG_UNICODE) - if (!sb_no_casefold_compat_fallback(dir->i_sb) && + if (f2fs_should_fallback_to_linear(dir) && IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 199c1e7a83ef..33e09c453c70 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { + f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, " + "extent node in rb tree [%u, %u, %u], age [%llu, %llu], " + "extent node to insert [%u, %u, %u], age [%llu, %llu]", + __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age, + en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks); f2fs_bug_on(sbi, 1); + return NULL; } } @@ -664,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode, if (!et) return; + if (unlikely(len == 0)) { + f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, " + "extent [%u, %u, %u], age [%llu, %llu]", + __func__, type, tei->fofs, tei->blk, tei->len, + tei->age, tei->last_blocks); + f2fs_bug_on(sbi, 1); + return; + } + if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e465bbc85ee..5b4e9548a231 100644 --- 
a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; * string rather than using the MS_LAZYTIME flag, so this must remain. */ #define F2FS_MOUNT_LAZYTIME 0x40000000 +#define F2FS_MOUNT_RESERVE_NODE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -155,6 +156,18 @@ enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; +enum bggc_io_aware_policy { + AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ + AWARE_READ_IO, /* skip background GC if there is pending read IO */ + AWARE_NONE, /* don't aware IO for background GC */ +}; + +enum device_allocation_policy { + ALLOCATE_FORWARD_NOHINT, + ALLOCATE_FORWARD_WITHIN_HINT, + ALLOCATE_FORWARD_FROM_HINT, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -172,6 +185,7 @@ struct f2fs_rwsem { struct f2fs_mount_info { unsigned int opt; block_t root_reserved_blocks; /* root reserved blocks */ + block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ @@ -212,6 +226,7 @@ struct f2fs_mount_info { int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 @@ -266,14 +281,36 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ +enum cp_time { + CP_TIME_START, /* begin */ + CP_TIME_LOCK, /* after cp_global_sem */ + CP_TIME_OP_LOCK, /* after block_operation */ + CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_SYNC_META, /* after sync_meta_pages */ + CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ + CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ + CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ + CP_TIME_FLUSH_DEVICE, /* after flush device cache */ + CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ + CP_TIME_END, /* after unblock_operation */ + CP_TIME_MAX, +}; + +/* time cost stats of checkpoint */ +struct cp_stats { + ktime_t times[CP_TIME_MAX]; +}; + struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; + struct cp_stats stats; }; /* @@ -334,7 +371,10 @@ struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ - ktime_t queue_time; /* request queued time */ + union { + ktime_t queue_time; /* request queued time */ + ktime_t delta_time; /* time in queue */ + }; }; struct ckpt_req_control { @@ -350,6 +390,9 @@ struct ckpt_req_control { unsigned int peak_time; /* peak wait time in msec until now */ }; +/* a time threshold that checkpoint was blocked for, unit: ms */ +#define CP_LONG_LATENCY_THRESHOLD 5000 + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1375,6 +1418,7 @@ enum { DISCARD_TIME, GC_TIME, 
DISABLE_TIME, + ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1454,6 +1498,12 @@ enum { TOTAL_CALL = FOREGROUND, }; +enum f2fs_lookup_mode { + LOOKUP_PERF, + LOOKUP_COMPAT, + LOOKUP_AUTO, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1643,6 +1693,7 @@ struct f2fs_sb_info { unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ + struct cp_stats cp_stats; /* for time stat of checkpoint */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1810,6 +1861,9 @@ struct f2fs_sb_info { spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; /* first segno in sequential zone */ + unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ + unsigned int allocate_section_hint; /* the boundary position between devices */ + unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; @@ -2362,13 +2416,11 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, +static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) return true; - if (!test_opt(sbi, RESERVE_ROOT)) - return false; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) @@ -2389,7 +2441,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { @@ -2747,7 +2799,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count; + unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; @@ -2769,15 +2821,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - avail_user_block_count = get_available_block_count(sbi, inode, false); + avail_user_block_count = get_available_block_count(sbi, inode, + test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } + avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (test_opt(sbi, RESERVE_NODE) && + !__allow_reserved_root(sbi, inode, true)) + avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; - if (unlikely(valid_node_count > sbi->total_node_count)) { + if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } @@ -3004,13 +3061,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (sbi->gc_mode == GC_URGENT_HIGH) return true; - if (zoned_gc) { - if (is_inflight_read_io(sbi)) - return false; - } 
else { - if (is_inflight_io(sbi, type)) - return false; - } + if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) + return false; + if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) + return false; if (sbi->gc_mode == GC_URGENT_MID) return true; @@ -3770,6 +3824,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); * node.c */ struct node_info; +enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); @@ -3792,7 +3847,8 @@ int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 42faaed6a02d..ffa045b39c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,15 +35,23 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> -static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) { loff_t old_size = i_size_read(inode); if (old_size >= new_size) return; + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); /* zero or drop pages only in range of [old_size, new_size] */ - truncate_pagecache(inode, old_size); + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + filemap_invalidate_unlock(inode->i_mapping); } static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); @@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode) /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); - if (err) + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. 
+ */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); return err; + } } err = f2fs_truncate_blocks(inode, i_size_read(inode), true); @@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, filemap_invalidate_lock(inode->i_mapping); if (attr->ia_size > old_size) - f2fs_zero_post_eof_page(inode, attr->ia_size); + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); f2fs_balance_fs(sbi, true); @@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); return count; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 098e9f71421e..a7708cf80c04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1071,7 +1071,7 @@ next_step: } /* phase == 2 */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) continue; @@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return false; @@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), 
GET_SUM_BLOCK(sbi, segno)); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto skip; + } + if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && @@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); @@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. + */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; @@ -2182,6 +2196,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; MAIN_SECS(sbi) += secs; + if (sbi->allocate_section_hint > MAIN_SECS(sbi)) + sbi->allocate_section_hint = MAIN_SECS(sbi); FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -2189,6 +2205,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; + sbi->allocate_section_hint = FDEV(0).total_segments / + SEGS_PER_SEC(sbi); + FDEV(last_dev).total_segments = (int)FDEV(last_dev).total_segments + segs; FDEV(last_dev).end_blk = diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..482a362f2625 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; static struct kmem_cache *fsync_node_entry_slab; +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} + /* * Check whether the given nid is within node id range. 
*/ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -871,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - nfolio[i] = f2fs_get_node_folio(sbi, nids[i]); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i], + NODE_TYPE_NON_INODE); if (IS_ERR(nfolio[i])) { err = PTR_ERR(nfolio[i]); f2fs_folio_put(nfolio[0], false); @@ -989,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - folio = f2fs_get_node_folio(sbi, dn->nid); + folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE); if (PTR_ERR(folio) == -ENOENT) return 1; else if (IS_ERR(folio)) @@ -1033,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid, + NODE_TYPE_NON_INODE); if (IS_ERR(folio)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); return PTR_ERR(folio); @@ -1111,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]); + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i], + NODE_TYPE_NON_INODE); if (IS_ERR(folios[i])) { err = PTR_ERR(folios[i]); idx = i - 1; @@ -1496,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - if (unlikely(nid != nid_of_node(folio) || - (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || - (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(folio))) || - time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - return -EFSCORRUPTED; + if (unlikely(nid != nid_of_node(folio))) + goto out_err; + + switch (ntype) { + case NODE_TYPE_INODE: + if (!IS_INODE(folio)) + goto out_err; + break; + case NODE_TYPE_XATTR: + if (!f2fs_has_xattr_block(ofs_of_node(folio))) + goto out_err; + break; + case NODE_TYPE_NON_INODE: + if (IS_INODE(folio)) + goto out_err; + break; + default: + break; } + if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER)) + goto out_err; return 0; +out_err: + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; } static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, @@ -1546,7 +1570,7 @@ repeat: if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto out_err; + goto out_put_err; } if (!f2fs_inode_chksum_verify(sbi, folio)) { @@ -1567,9 +1591,10 @@ out_put_err: 
return ERR_PTR(err); } -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type) { - return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR); + return __get_node_folio(sbi, nid, NULL, 0, node_type); } struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) @@ -2634,6 +2659,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 030390543b54..9cb8dcf8d417 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -57,6 +57,7 @@ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, }; /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4cb3a91801b4..215e442db72c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -548,7 +548,7 @@ got_it: } /* Get the node page */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return PTR_ERR(node_folio); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc82d42ef14c..b45eace879d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int alloc_policy = sbi->allocate_section_policy; + unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; int i; int ret = 0; @@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi, } #endif + /* + * Prevent allocate_section_hint from exceeding MAIN_SECS() + * due to desynchronization. 
+ */ + if (alloc_policy != ALLOCATE_FORWARD_NOHINT && + alloc_hint > MAIN_SECS(sbi)) + alloc_hint = MAIN_SECS(sbi); + + if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && + hint < alloc_hint) + hint = alloc_hint; + else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && + hint >= alloc_hint) + hint = 0; + find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); @@ -3672,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_cow_file(inode)) + f2fs_is_cow_file(inode) || + is_inode_flag_set(inode, FI_NEED_IPU)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -3936,12 +3954,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); + int err; if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio)) { + err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (unlikely(err)) { + f2fs_err_ratelimited(fio->sbi, + "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", + __func__, fio->ino, folio->index, type, + fio->old_blkaddr, fio->new_blkaddr, err); if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e2ee5c686b1..1ce2c8abaf48 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } +static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, + enum log_type type, unsigned int segno) +{ + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, type)->next_blkoff; + return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); +} + static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) @@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, i)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, i, segno); blocks = i <= CURSEG_COLD_DATA ? 
data_blocks : node_blocks; if (blocks > left_blocks) @@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); if (dent_blocks > left_blocks) return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2619cbbd7d2d..fd8e7b0b2166 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -143,6 +143,7 @@ enum { Opt_extent_cache, Opt_data_flush, Opt_reserve_root, + Opt_reserve_node, Opt_resgid, Opt_resuid, Opt_mode, @@ -181,6 +182,7 @@ enum { Opt_nat_bits, Opt_jqfmt, Opt_checkpoint, + Opt_lookup_mode, Opt_err, }; @@ -244,6 +246,13 @@ static const struct constant_table f2fs_param_errors[] = { {} }; +static const struct constant_table f2fs_param_lookup_mode[] = { + {"perf", LOOKUP_PERF}, + {"compat", LOOKUP_COMPAT}, + {"auto", LOOKUP_AUTO}, + {} +}; + static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), @@ -265,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_flag_no("extent_cache", Opt_extent_cache), fsparam_flag("data_flush", Opt_data_flush), fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_u32("reserve_node", Opt_reserve_node), fsparam_gid("resgid", Opt_resgid), fsparam_uid("resuid", Opt_resuid), fsparam_enum("mode", Opt_mode, f2fs_param_mode), @@ -300,6 +310,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), fsparam_flag("age_extent_cache", Opt_age_extent_cache), fsparam_enum("errors", Opt_errors, f2fs_param_errors), + fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode), {} }; @@ -336,6 +347,8 @@ static match_table_t f2fs_checkpoint_tokens = { #define F2FS_SPEC_discard_unit (1 << 21) #define F2FS_SPEC_memory_mode (1 << 22) #define F2FS_SPEC_errors (1 << 23) +#define F2FS_SPEC_lookup_mode (1 << 24) +#define F2FS_SPEC_reserve_node (1 << 25) struct f2fs_fs_context { struct f2fs_mount_info info; @@ -437,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count >> 3), + block_t block_limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); + block_t node_limit = sbi->total_node_count >> 3; /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && - F2FS_OPTION(sbi).root_reserved_blocks > limit) { - F2FS_OPTION(sbi).root_reserved_blocks = limit; + F2FS_OPTION(sbi).root_reserved_blocks > block_limit) { + F2FS_OPTION(sbi).root_reserved_blocks = block_limit; f2fs_info(sbi, "Reduce reserved blocks for root = %u", F2FS_OPTION(sbi).root_reserved_blocks); } - if (!test_opt(sbi, RESERVE_ROOT) && + if (test_opt(sbi, RESERVE_NODE) && + F2FS_OPTION(sbi).root_reserved_nodes > node_limit) { + F2FS_OPTION(sbi).root_reserved_nodes = node_limit; + f2fs_info(sbi, "Reduce reserved nodes for root = %u", + F2FS_OPTION(sbi).root_reserved_nodes); + } + if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) && 
(!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) - f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root" + " and reserve_node", from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -847,6 +868,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; ctx->spec_mask |= F2FS_SPEC_reserve_root; break; + case Opt_reserve_node: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_node; + break; case Opt_resuid: F2FS_CTX_INFO(ctx).s_resuid = result.uid; ctx->spec_mask |= F2FS_SPEC_resuid; @@ -994,6 +1020,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; default: @@ -1149,6 +1179,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_nat_bits: ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); break; + case Opt_lookup_mode: + F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_lookup_mode; + break; } return 0; } @@ -1191,7 +1225,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (strcmp(old_qname, new_qname) == 0) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } @@ -1430,6 +1468,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc, ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; } + if (test_opt(sbi, RESERVE_NODE) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { + f2fs_info(sbi, "Preserve previous reserve_node=%u", + F2FS_OPTION(sbi).root_reserved_nodes); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE; + } err = f2fs_check_test_dummy_encryption(fc, sb); if (err) @@ -1629,6 +1675,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) if (ctx->spec_mask & F2FS_SPEC_reserve_root) F2FS_OPTION(sbi).root_reserved_blocks = F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_reserve_node) + F2FS_OPTION(sbi).root_reserved_nodes = + F2FS_CTX_INFO(ctx).root_reserved_nodes; if (ctx->spec_mask & F2FS_SPEC_resgid) F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; if (ctx->spec_mask & F2FS_SPEC_resuid) @@ -1658,6 +1707,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; if (ctx->spec_mask & F2FS_SPEC_errors) F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + if (ctx->spec_mask & F2FS_SPEC_lookup_mode) + F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode; f2fs_apply_compression(fc, sb); f2fs_apply_test_dummy_encryption(fc, sb); @@ -2349,9 +2400,11 @@ static int f2fs_show_options(struct seq_file 
*seq, struct dentry *root) else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); - if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE)) + seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u," + "resgid=%u", F2FS_OPTION(sbi).root_reserved_blocks, + F2FS_OPTION(sbi).root_reserved_nodes, from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -2422,6 +2475,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, NAT_BITS)) seq_puts(seq, ",nat_bits"); + if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF) + seq_show_option(seq, "lookup_mode", "perf"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT) + seq_show_option(seq, "lookup_mode", "compat"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO) + seq_show_option(seq, "lookup_mode", "auto"); + return 0; } @@ -2486,6 +2546,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) #endif f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + + F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF; } #ifdef CONFIG_QUOTA @@ -2566,21 +2628,39 @@ out_unlock: restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err); return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - int retry = DEFAULT_RETRY_IO_COUNT; + unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + long long start, writeback, end; + + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DATA)); + + f2fs_update_time(sbi, ENABLE_TIME); + + start = ktime_get(); /* we should flush all the data to keep data consistency */ - do { - sync_inodes_sb(sbi->sb); + while (get_pages(sbi, F2FS_DIRTY_DATA)) { + writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); - if (unlikely(retry < 0)) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + if (f2fs_time_over(sbi, ENABLE_TIME)) + break; + } + writeback = ktime_get(); + + sync_inodes_sb(sbi->sb); + + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -2593,6 +2673,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); + + end = ktime_get(); + + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); } static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) @@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = SEGS_TO_BLKS(sbi, ((le32_to_cpu(raw_super->segment_count_nat) / 2) * NAT_ENTRY_PER_BLOCK)); + sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count); + sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT; F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = 
@@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
	sbi->total_node_count = SEGS_TO_BLKS(sbi,
			((le32_to_cpu(raw_super->segment_count_nat) / 2) *
			NAT_ENTRY_PER_BLOCK));
+	sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count);
+	sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT;
	F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
	F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
	F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
@@ -4179,6 +4267,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
	sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
	sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
	sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+	sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL;
	sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
			DEF_UMOUNT_DISCARD_TIMEOUT;
	clear_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -4637,9 +4726,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
	logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
	sbi->aligned_blksize = true;
+	sbi->bggc_io_aware = AWARE_ALL_IO;
#ifdef CONFIG_BLK_DEV_ZONED
	sbi->max_open_zones = UINT_MAX;
	sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ;
+	sbi->bggc_io_aware = AWARE_READ_IO;
#endif

	for (i = 0; i < max_devices; i++) {
@@ -4667,6 +4758,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
				SEGS_TO_BLKS(sbi, FDEV(i).total_segments) -
				1 + le32_to_cpu(raw_super->segment0_blkaddr);
+			sbi->allocate_section_hint = FDEV(i).total_segments /
+							SEGS_PER_SEC(sbi);
		} else {
			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
			FDEV(i).end_blk = FDEV(i).start_blk +
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f736052dea50..6d2a4fba68a2 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a,
		le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags));
}

+static ssize_t effective_lookup_mode_show(struct f2fs_attr *a,
+		struct f2fs_sb_info *sbi, char *buf)
+{
+	switch (F2FS_OPTION(sbi).lookup_mode) {
+	case LOOKUP_PERF:
+		return sysfs_emit(buf, "perf\n");
+	case LOOKUP_COMPAT:
+		return sysfs_emit(buf, "compat\n");
+	case LOOKUP_AUTO:
+		if (sb_no_casefold_compat_fallback(sbi->sb))
+			return sysfs_emit(buf, "auto:perf\n");
+		return sysfs_emit(buf, "auto:compat\n");
+	}
+	return 0;
+}
+
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
		struct f2fs_sb_info *sbi, char *buf)
{
@@ -866,6 +882,27 @@ out:
		return count;
	}

+	if (!strcmp(a->attr.name, "bggc_io_aware")) {
+		if (t < AWARE_ALL_IO || t > AWARE_NONE)
+			return -EINVAL;
+		sbi->bggc_io_aware = t;
+		return count;
+	}
+
+	if (!strcmp(a->attr.name, "allocate_section_hint")) {
+		if (t < 0 || t > MAIN_SECS(sbi))
+			return -EINVAL;
+		sbi->allocate_section_hint = t;
+		return count;
+	}
+
+	if (!strcmp(a->attr.name, "allocate_section_policy")) {
+		if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT)
+			return -EINVAL;
+		sbi->allocate_section_policy = t;
+		return count;
+	}
+
	*ui = (unsigned int)t;

	return count;
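effective_lookup_mode_show() above folds the lookup_mode mount option together with the on-disk SB_ENC_NO_COMPAT_FALLBACK_FL encoding flag. A hedged sketch of the kind of predicate it depends on; the real definition lives elsewhere in f2fs and may differ:

	/* Assumption: helper shape inferred from its use in effective_lookup_mode_show(). */
	static inline bool sb_no_casefold_compat_fallback(struct super_block *sb)
	{
		return !!(le16_to_cpu(F2FS_RAW_SUPER(F2FS_SB(sb))->s_encoding_flags) &
				SB_ENC_NO_COMPAT_FALLBACK_FL);
	}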
@@ -1138,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search);
F2FS_SBI_GENERAL_RW_ATTR(migration_granularity);
F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms);
@@ -1175,6 +1214,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
+F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);

/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1211,6 +1251,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(encoding_flags);
+F2FS_GENERAL_RO_ATTR(effective_lookup_mode);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
@@ -1303,6 +1344,7 @@ static struct attribute *f2fs_attrs[] = {
	ATTR_LIST(discard_idle_interval),
	ATTR_LIST(gc_idle_interval),
	ATTR_LIST(umount_discard_timeout),
+	ATTR_LIST(bggc_io_aware),
#ifdef CONFIG_F2FS_IOSTAT
	ATTR_LIST(iostat_enable),
	ATTR_LIST(iostat_period_ms),
@@ -1329,6 +1371,7 @@ static struct attribute *f2fs_attrs[] = {
	ATTR_LIST(current_reserved_blocks),
	ATTR_LIST(encoding),
	ATTR_LIST(encoding_flags),
+	ATTR_LIST(effective_lookup_mode),
	ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
	ATTR_LIST(cp_foreground_calls),
@@ -1371,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = {
	ATTR_LIST(max_read_extent_count),
	ATTR_LIST(carve_out),
	ATTR_LIST(reserved_pin_section),
+	ATTR_LIST(allocate_section_hint),
+	ATTR_LIST(allocate_section_policy),
	NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1723,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
	seq_printf(seq, " Main : 0x%010x (%10d)\n",
			SM_I(sbi)->main_blkaddr,
			le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
-	seq_printf(seq, " # of Sections : %12d\n",
-			le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+	seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10);
+	seq_printf(seq, " Segment size : %12d MB\n",
+			(BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
	seq_printf(seq, " Segs/Sections : %12d\n", SEGS_PER_SEC(sbi));
	seq_printf(seq, " Section size : %12d MB\n",
-			SEGS_PER_SEC(sbi) << 1);
+			(BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
+	seq_printf(seq, " # of Sections : %12d\n",
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));

	if (!f2fs_is_multi_device(sbi))
		return 0;
@@ -1742,6 +1790,69 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
	return 0;
}

+static int __maybe_unused donation_list_seq_show(struct seq_file *seq,
+							void *offset)
+{
+	struct super_block *sb = seq->private;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+	struct f2fs_inode_info *fi;
+	struct dentry *dentry;
+	char *buf, *path;
+	int i;
+
+	buf = f2fs_getname(sbi);
+	if (!buf)
+		return 0;
+
+	seq_printf(seq, "Donation List\n");
+	seq_printf(seq, " # of files : %u\n", sbi->donate_files);
+	seq_printf(seq, " %-50s %10s %20s %20s %22s\n",
+			"File path", "Status", "Donation offset (kb)",
+			"Donation size (kb)", "File cached size (kb)");
+	seq_printf(seq, "---\n");
+
+	for (i = 0; i < sbi->donate_files; i++) {
+		spin_lock(&sbi->inode_lock[DONATE_INODE]);
+		if (list_empty(&sbi->inode_list[DONATE_INODE])) {
+			spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+			break;
+		}
+		fi = list_first_entry(&sbi->inode_list[DONATE_INODE],
+				struct f2fs_inode_info, gdonate_list);
+		list_move_tail(&fi->gdonate_list,
+				&sbi->inode_list[DONATE_INODE]);
+		inode = igrab(&fi->vfs_inode);
+		spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+
+		if (!inode)
+			continue;
+
+		inode_lock_shared(inode);
+
+		dentry = d_find_alias(inode);
+		if (!dentry) {
+			path = NULL;
+		} else {
+			path = dentry_path_raw(dentry, buf, PATH_MAX);
+			if (IS_ERR(path))
+				goto next;
+		}
+ "Evicted" : "Donated", + (loff_t)fi->donate_start << (PAGE_SHIFT - 10), + (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), + (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); +next: + dput(dentry); + inode_unlock_shared(inode); + iput(inode); + } + f2fs_putname(buf); + return 0; +} + #ifdef CONFIG_F2FS_FAULT_INJECTION static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, void *offset) @@ -1851,6 +1962,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); + proc_create_single_data("donation_list", 0444, sbi->s_proc, + donation_list_seq_show, sb); #ifdef CONFIG_F2FS_FAULT_INJECTION proc_create_single_data("inject_stats", 0444, sbi->s_proc, inject_stats_seq_show, sb); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f8b8bfc0e73..6afb4a13b81d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -79,6 +79,7 @@ enum stop_cp_reason { STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_CORRUPTED_FREE_BITMAP, + STOP_CP_REASON_CORRUPTED_NID, STOP_CP_REASON_MAX, }; |