 Documentation/ABI/testing/sysfs-fs-f2fs |   6
 Documentation/filesystems/f2fs.rst      | 131
 fs/f2fs/checkpoint.c                    |  10
 fs/f2fs/compress.c                      |  17
 fs/f2fs/data.c                          |  50
 fs/f2fs/debug.c                         |  29
 fs/f2fs/extent_cache.c                  |   5
 fs/f2fs/f2fs.h                          | 162
 fs/f2fs/file.c                          |  26
 fs/f2fs/gc.c                            | 165
 fs/f2fs/gc.h                            |   2
 fs/f2fs/inline.c                        |   4
 fs/f2fs/inode.c                         |   6
 fs/f2fs/namei.c                         |  39
 fs/f2fs/recovery.c                      |  31
 fs/f2fs/segment.c                       |  63
 fs/f2fs/segment.h                       |  21
 fs/f2fs/super.c                         | 208
 fs/f2fs/sysfs.c                         |   9
 fs/f2fs/verity.c                        |   2
 fs/f2fs/xattr.c                         |  32
 fs/f2fs/xattr.h                         |  10
 include/linux/f2fs_fs.h                 |   5
 include/trace/events/f2fs.h             |  59
 include/trace/events/io_uring.h         |  12
 io_uring/io-wq.c                        |   5
 io_uring/kbuf.c                         |  16
 io_uring/poll.c                         |  52
 io_uring/rsrc.c                         |  47
 29 files changed, 710 insertions(+), 514 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index b590809869ca..770470e0598b 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -643,6 +643,12 @@ Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of unusable blocks in a section which was defined by
the zone capacity reported by underlying zoned device.
+What: /sys/fs/f2fs/<disk>/max_open_zones
+Date: November 2025
+Contact: "Yongpeng Yang" <yangyongpeng@xiaomi.com>
+Description: Shows the max number of zones that F2FS can write concurrently when a zoned
+ device is mounted.
+
What: /sys/fs/f2fs/<disk>/current_atomic_write
Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index a8d02fe5be83..cb90d1ae82d0 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -188,34 +188,36 @@ fault_type=%d Support configuring fault injection type, should be
enabled with fault_injection option, fault type value
is shown below, it supports single or combined type.
- =========================== ==========
- Type_Name Type_Value
- =========================== ==========
- FAULT_KMALLOC 0x00000001
- FAULT_KVMALLOC 0x00000002
- FAULT_PAGE_ALLOC 0x00000004
- FAULT_PAGE_GET 0x00000008
- FAULT_ALLOC_BIO 0x00000010 (obsolete)
- FAULT_ALLOC_NID 0x00000020
- FAULT_ORPHAN 0x00000040
- FAULT_BLOCK 0x00000080
- FAULT_DIR_DEPTH 0x00000100
- FAULT_EVICT_INODE 0x00000200
- FAULT_TRUNCATE 0x00000400
- FAULT_READ_IO 0x00000800
- FAULT_CHECKPOINT 0x00001000
- FAULT_DISCARD 0x00002000
- FAULT_WRITE_IO 0x00004000
- FAULT_SLAB_ALLOC 0x00008000
- FAULT_DQUOT_INIT 0x00010000
- FAULT_LOCK_OP 0x00020000
- FAULT_BLKADDR_VALIDITY 0x00040000
- FAULT_BLKADDR_CONSISTENCE 0x00080000
- FAULT_NO_SEGMENT 0x00100000
- FAULT_INCONSISTENT_FOOTER 0x00200000
- FAULT_TIMEOUT 0x00400000 (1000ms)
- FAULT_VMALLOC 0x00800000
- =========================== ==========
+ .. code-block:: none
+
+ =========================== ==========
+ Type_Name Type_Value
+ =========================== ==========
+ FAULT_KMALLOC 0x00000001
+ FAULT_KVMALLOC 0x00000002
+ FAULT_PAGE_ALLOC 0x00000004
+ FAULT_PAGE_GET 0x00000008
+ FAULT_ALLOC_BIO 0x00000010 (obsolete)
+ FAULT_ALLOC_NID 0x00000020
+ FAULT_ORPHAN 0x00000040
+ FAULT_BLOCK 0x00000080
+ FAULT_DIR_DEPTH 0x00000100
+ FAULT_EVICT_INODE 0x00000200
+ FAULT_TRUNCATE 0x00000400
+ FAULT_READ_IO 0x00000800
+ FAULT_CHECKPOINT 0x00001000
+ FAULT_DISCARD 0x00002000
+ FAULT_WRITE_IO 0x00004000
+ FAULT_SLAB_ALLOC 0x00008000
+ FAULT_DQUOT_INIT 0x00010000
+ FAULT_LOCK_OP 0x00020000
+ FAULT_BLKADDR_VALIDITY 0x00040000
+ FAULT_BLKADDR_CONSISTENCE 0x00080000
+ FAULT_NO_SEGMENT 0x00100000
+ FAULT_INCONSISTENT_FOOTER 0x00200000
+ FAULT_TIMEOUT 0x00400000 (1000ms)
+ FAULT_VMALLOC 0x00800000
+ =========================== ==========
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
@@ -296,14 +298,15 @@ nocheckpoint_merge Disable checkpoint merge feature.
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
"lz4", "zstd" and "lzo-rle" algorithm.
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
- "lz4" and "zstd" support compress level config.
-
- ========= ===========
- algorithm level range
- ========= ===========
- lz4 3 - 16
- zstd 1 - 22
- ========= ===========
+ "lz4" and "zstd" support compress level config::
+
+ ========= ===========
+ algorithm level range
+ ========= ===========
+ lz4 3 - 16
+ zstd 1 - 22
+ ========= ===========
+
compress_log_size=%u Support configuring compress cluster size. The size will
be 4KB * (1 << %u). The default and minimum sizes are 16KB.
compress_extension=%s Support adding specified extension, so that f2fs can enable
@@ -368,38 +371,42 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes:
the partition in read-only mode. By default it uses "continue"
mode.
- ====================== =============== =============== ========
- mode continue remount-ro panic
- ====================== =============== =============== ========
- access ops normal normal N/A
- syscall errors -EIO -EROFS N/A
- mount option rw ro N/A
- pending dir write keep keep N/A
- pending non-dir write drop keep N/A
- pending node write drop keep N/A
- pending meta write keep keep N/A
- ====================== =============== =============== ========
+ .. code-block:: none
+
+ ====================== =============== =============== ========
+ mode continue remount-ro panic
+ ====================== =============== =============== ========
+ access ops normal normal N/A
+ syscall errors -EIO -EROFS N/A
+ mount option rw ro N/A
+ pending dir write keep keep N/A
+ pending non-dir write drop keep N/A
+ pending node write drop keep N/A
+ pending meta write keep keep N/A
+ ====================== =============== =============== ========
nat_bits Enable nat_bits feature to enhance full/empty nat blocks access,
by default it's disabled.
lookup_mode=%s Control the directory lookup behavior for casefolded
directories. This option has no effect on directories
that do not have the casefold feature enabled.
- ================== ========================================
- Value Description
- ================== ========================================
- perf (Default) Enforces a hash-only lookup.
- The linear search fallback is always
- disabled, ignoring the on-disk flag.
- compat Enables the linear search fallback for
- compatibility with directory entries
- created by older kernel that used a
- different case-folding algorithm.
- This mode ignores the on-disk flag.
- auto F2FS determines the mode based on the
- on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL`
- flag.
- ================== ========================================
+ .. code-block:: none
+
+ ================== ========================================
+ Value Description
+ ================== ========================================
+ perf (Default) Enforces a hash-only lookup.
+ The linear search fallback is always
+ disabled, ignoring the on-disk flag.
+ compat Enables the linear search fallback for
+ compatibility with directory entries
+ created by older kernels that used a
+ different case-folding algorithm.
+ This mode ignores the on-disk flag.
+ auto F2FS determines the mode based on the
+ on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL`
+ flag.
+ ================== ========================================
======================== ============================================================
Debugfs Entries
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bbe07e3a6c75..300664269eb6 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1318,7 +1318,7 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
f2fs_submit_merged_write(sbi, DATA);
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
}
@@ -1673,7 +1673,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
goto out;
}
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS);
err = block_operations(sbi);
if (err)
@@ -1681,7 +1681,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
stat_cp_time(cpc, CP_TIME_OP_LOCK);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS);
f2fs_flush_merged_writes(sbi);
@@ -1747,7 +1747,7 @@ stop:
/* update CP_TIME to trigger checkpoint periodically */
f2fs_update_time(sbi, CP_TIME);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT);
out:
if (cpc->reason != CP_RESIZE)
f2fs_up_write(&sbi->cp_global_sem);
@@ -1974,7 +1974,7 @@ void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
/* Let's wait for the previous dispatched checkpoint. */
while (atomic_read(&cprc->queued_ckpt))
- io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
}
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
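The three trace calls above stop passing string literals and pass enum
f2fs_cp_phase values instead, so only an integer is recorded per event; the
trace header (also touched in this series, per the diffstat) presumably decodes
it back to a name, e.g. via __print_symbolic(). A minimal userspace sketch of
the enum-to-name mapping, with names matching the old literals:

        #include <stdio.h>

        enum cp_phase {
                CP_PHASE_START_BLOCK_OPS,
                CP_PHASE_FINISH_BLOCK_OPS,
                CP_PHASE_FINISH_CHECKPOINT,
        };

        static const char *cp_phase_name(enum cp_phase p)
        {
                switch (p) {
                case CP_PHASE_START_BLOCK_OPS:   return "start block_ops";
                case CP_PHASE_FINISH_BLOCK_OPS:  return "finish block_ops";
                case CP_PHASE_FINISH_CHECKPOINT: return "finish checkpoint";
                }
                return "unknown";
        }

        int main(void)
        {
                printf("%s\n", cp_phase_name(CP_PHASE_FINISH_CHECKPOINT));
                return 0;
        }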
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d7e6f563b3e4..7b68bf22989d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -120,7 +120,7 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len)
}
static void f2fs_put_rpages_wbc(struct compress_ctx *cc,
- struct writeback_control *wbc, bool redirty, int unlock)
+ struct writeback_control *wbc, bool redirty, bool unlock)
{
unsigned int i;
@@ -759,10 +759,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
ret = -EFSCORRUPTED;
/* Avoid f2fs_commit_super in irq context */
- if (!in_task)
- f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION);
- else
- f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION);
+ f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION);
goto out_release;
}
@@ -1060,7 +1057,7 @@ static void cancel_cluster_writeback(struct compress_ctx *cc,
f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA);
while (atomic_read(&cic->pending_pages) !=
(cc->valid_nr_cpages - submitted + 1))
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
}
/* Cancel writeback and stay locked. */
@@ -1205,7 +1202,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
if (copied)
set_cluster_dirty(&cc);
- f2fs_put_rpages_wbc(&cc, NULL, false, 1);
+ f2fs_put_rpages_wbc(&cc, NULL, false, true);
f2fs_destroy_compress_ctx(&cc, false);
return first_index;
@@ -1577,7 +1574,7 @@ continue_unlock:
*/
if (IS_NOQUOTA(cc->inode))
goto out;
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
goto retry_write;
}
goto out;
@@ -1608,7 +1605,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
add_compr_block_stat(cc->inode, cc->cluster_size);
goto write;
} else if (err) {
- f2fs_put_rpages_wbc(cc, wbc, true, 1);
+ f2fs_put_rpages_wbc(cc, wbc, true, true);
goto destroy_out;
}
@@ -1622,7 +1619,7 @@ write:
f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
- f2fs_put_rpages_wbc(cc, wbc, false, 0);
+ f2fs_put_rpages_wbc(cc, wbc, false, false);
destroy_out:
f2fs_destroy_compress_ctx(cc, false);
return err;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8bf4feda42b0..c30e69392a62 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -752,7 +752,7 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
}
static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
- struct page *page, enum temp_type temp)
+ struct folio *folio, enum temp_type temp)
{
struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
struct bio_entry *be;
@@ -761,8 +761,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
be->bio = bio;
bio_get(bio);
- if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
- f2fs_bug_on(sbi, 1);
+ bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
f2fs_down_write(&io->bio_list_lock);
list_add_tail(&be->list, &io->bio_list);
@@ -776,7 +775,7 @@ static void del_bio_entry(struct bio_entry *be)
}
static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
- struct page *page)
+ struct folio *folio)
{
struct folio *fio_folio = fio->folio;
struct f2fs_sb_info *sbi = fio->sbi;
@@ -802,8 +801,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
if (f2fs_crypt_mergeable_bio(*bio,
fio_folio->mapping->host,
fio_folio->index, fio) &&
- bio_add_page(*bio, page, PAGE_SIZE, 0) ==
- PAGE_SIZE) {
+ bio_add_folio(*bio, folio, folio_size(folio), 0)) {
ret = 0;
break;
}
@@ -904,9 +902,9 @@ alloc_new:
f2fs_set_bio_crypt_ctx(bio, folio->mapping->host,
folio->index, fio, GFP_NOIO);
- add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp);
+ add_bio_entry(fio->sbi, bio, data_folio, fio->temp);
} else {
- if (add_ipu_page(fio, &bio, &data_folio->page))
+ if (add_ipu_page(fio, &bio, data_folio))
goto alloc_new;
}
@@ -1275,7 +1273,7 @@ struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct folio *folio;
- folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0);
+ folio = f2fs_filemap_get_folio(mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio))
goto read;
if (folio_test_uptodate(folio))
@@ -1420,6 +1418,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag)
{
+ f2fs_down_read(&sbi->cp_enable_rwsem);
if (flag == F2FS_GET_BLOCK_PRE_AIO)
f2fs_down_read(&sbi->node_change);
else
@@ -1432,6 +1431,7 @@ static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag)
f2fs_up_read(&sbi->node_change);
else
f2fs_unlock_op(sbi);
+ f2fs_up_read(&sbi->cp_enable_rwsem);
}
int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index)
@@ -3138,8 +3138,8 @@ result:
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
- f2fs_io_schedule_timeout(
- DEFAULT_IO_TIMEOUT);
+ f2fs_schedule_timeout(
+ DEFAULT_SCHEDULE_TIMEOUT);
goto retry_write;
}
goto next;
@@ -3221,6 +3221,19 @@ static inline bool __should_serialize_io(struct inode *inode,
return false;
}
+static inline void account_writeback(struct inode *inode, bool inc)
+{
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return;
+
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
+ if (inc)
+ atomic_inc(&F2FS_I(inode)->writeback);
+ else
+ atomic_dec(&F2FS_I(inode)->writeback);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
+}
+
static int __f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc,
enum iostat_type io_type)
@@ -3266,10 +3279,14 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
locked = true;
}
+ account_writeback(inode, true);
+
blk_start_plug(&plug);
ret = f2fs_write_cache_pages(mapping, wbc, io_type);
blk_finish_plug(&plug);
+ account_writeback(inode, false);
+
if (locked)
mutex_unlock(&sbi->writepages);
@@ -3566,8 +3583,9 @@ repeat:
* Do not use FGP_STABLE to avoid deadlock.
* Will wait that below with our IO control.
*/
- folio = __filemap_get_folio(mapping, index,
- FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
+ folio = f2fs_filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_NOFS,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
goto fail;
@@ -3637,8 +3655,7 @@ repeat:
return 0;
put_folio:
- folio_unlock(folio);
- folio_put(folio);
+ f2fs_folio_put(folio, true);
fail:
f2fs_write_failed(inode, pos + len);
return err;
@@ -3694,8 +3711,7 @@ static int f2fs_write_end(const struct kiocb *iocb,
pos + copied);
}
unlock_out:
- folio_unlock(folio);
- folio_put(folio);
+ f2fs_folio_put(folio, true);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
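The hunks above replace bio_add_page() plus an f2fs_bug_on() with the folio
helpers: bio_add_folio() returns a bool for the mergeable-IPU path, and
bio_add_folio_nofail() is the must-succeed variant, valid only when the bio is
known to have room, e.g. freshly allocated with enough vecs. A minimal sketch
of the nofail pattern; write_one_folio() is illustrative, not an f2fs function:

        #include <linux/bio.h>

        /* A bio allocated with room for one vec cannot fail the add. */
        static void write_one_folio(struct block_device *bdev,
                                    struct folio *folio, sector_t sector)
        {
                struct bio *bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_NOIO);

                bio->bi_iter.bi_sector = sector;
                /* room for at least one vec, so this cannot fail */
                bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
                submit_bio(bio);
        }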
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 43a83bbd3bc5..032683835569 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -251,6 +251,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
+ si->blkoff[i] = curseg->next_blkoff;
si->curseg[i] = curseg->segno;
si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
@@ -508,55 +509,63 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
- seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n",
- "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk");
- seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " TYPE %8s %8s %8s %8s %10s %10s %10s\n",
+ "blkoff", "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk");
+ seq_printf(s, " - COLD data: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_COLD_DATA],
si->curseg[CURSEG_COLD_DATA],
si->cursec[CURSEG_COLD_DATA],
si->curzone[CURSEG_COLD_DATA],
si->dirty_seg[CURSEG_COLD_DATA],
si->full_seg[CURSEG_COLD_DATA],
si->valid_blks[CURSEG_COLD_DATA]);
- seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " - WARM data: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_WARM_DATA],
si->curseg[CURSEG_WARM_DATA],
si->cursec[CURSEG_WARM_DATA],
si->curzone[CURSEG_WARM_DATA],
si->dirty_seg[CURSEG_WARM_DATA],
si->full_seg[CURSEG_WARM_DATA],
si->valid_blks[CURSEG_WARM_DATA]);
- seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " - HOT data: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_HOT_DATA],
si->curseg[CURSEG_HOT_DATA],
si->cursec[CURSEG_HOT_DATA],
si->curzone[CURSEG_HOT_DATA],
si->dirty_seg[CURSEG_HOT_DATA],
si->full_seg[CURSEG_HOT_DATA],
si->valid_blks[CURSEG_HOT_DATA]);
- seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " - Dir dnode: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_HOT_NODE],
si->curseg[CURSEG_HOT_NODE],
si->cursec[CURSEG_HOT_NODE],
si->curzone[CURSEG_HOT_NODE],
si->dirty_seg[CURSEG_HOT_NODE],
si->full_seg[CURSEG_HOT_NODE],
si->valid_blks[CURSEG_HOT_NODE]);
- seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " - File dnode: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_WARM_NODE],
si->curseg[CURSEG_WARM_NODE],
si->cursec[CURSEG_WARM_NODE],
si->curzone[CURSEG_WARM_NODE],
si->dirty_seg[CURSEG_WARM_NODE],
si->full_seg[CURSEG_WARM_NODE],
si->valid_blks[CURSEG_WARM_NODE]);
- seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n",
+ seq_printf(s, " - Indir nodes: %8d %8d %8d %8d %10u %10u %10u\n",
+ si->blkoff[CURSEG_COLD_NODE],
si->curseg[CURSEG_COLD_NODE],
si->cursec[CURSEG_COLD_NODE],
si->curzone[CURSEG_COLD_NODE],
si->dirty_seg[CURSEG_COLD_NODE],
si->full_seg[CURSEG_COLD_NODE],
si->valid_blks[CURSEG_COLD_NODE]);
- seq_printf(s, " - Pinned file: %8d %8d %8d\n",
+ seq_printf(s, " - Pinned file: %8d %8d %8d %8d\n",
+ si->blkoff[CURSEG_COLD_DATA_PINNED],
si->curseg[CURSEG_COLD_DATA_PINNED],
si->cursec[CURSEG_COLD_DATA_PINNED],
si->curzone[CURSEG_COLD_DATA_PINNED]);
- seq_printf(s, " - ATGC data: %8d %8d %8d\n",
+ seq_printf(s, " - ATGC data: %8d %8d %8d %8d\n",
+ si->blkoff[CURSEG_ALL_DATA_ATGC],
si->curseg[CURSEG_ALL_DATA_ATGC],
si->cursec[CURSEG_ALL_DATA_ATGC],
si->curzone[CURSEG_ALL_DATA_ATGC]);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 33e09c453c70..0ed84cc065a7 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -808,7 +808,7 @@ static void __update_extent_tree_range(struct inode *inode,
}
goto out_read_extent_cache;
update_age_extent_cache:
- if (!tei->last_blocks)
+ if (tei->last_blocks == F2FS_EXTENT_AGE_INVALID)
goto out_read_extent_cache;
__set_extent_info(&ei, fofs, len, 0, false,
@@ -912,7 +912,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
cur_age = cur_blocks - tei.last_blocks;
else
/* allocated_data_blocks overflow */
- cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
+ cur_age = (ULLONG_MAX - 1) - tei.last_blocks + cur_blocks;
if (tei.age)
ei->age = __calculate_block_age(sbi, cur_age, tei.age);
@@ -1114,6 +1114,7 @@ void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
struct extent_info ei = {
.fofs = fofs,
.len = len,
+ .last_blocks = F2FS_EXTENT_AGE_INVALID,
};
if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE))
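The hunks above reserve ULLONG_MAX as F2FS_EXTENT_AGE_INVALID, so a last_blocks
of 0, a value the allocation counter can legitimately hold, is no longer
misread as "no age information"; the counter itself is reset before it can
reach ULLONG_MAX (see the segment.c hunk later in this diff), which keeps the
sentinel unambiguous. A minimal userspace sketch of the sentinel plus the
wraparound arithmetic, mirroring the overflow branch in __get_new_block_age():

        #include <stdint.h>
        #include <stdio.h>

        #define AGE_INVALID UINT64_MAX  /* mirrors F2FS_EXTENT_AGE_INVALID */

        /* Elapsed allocations from last_blocks to cur_blocks on a counter
         * that is reset to 0 before it can reach UINT64_MAX. */
        static uint64_t block_age(uint64_t cur_blocks, uint64_t last_blocks)
        {
                if (last_blocks == AGE_INVALID)
                        return 0;       /* no age recorded for this extent */
                if (cur_blocks >= last_blocks)
                        return cur_blocks - last_blocks;
                /* counter wrapped, as in the overflow branch above */
                return (UINT64_MAX - 1) - last_blocks + cur_blocks;
        }

        int main(void)
        {
                printf("%llu\n",
                       (unsigned long long)block_age(10, UINT64_MAX - 5));
                return 0;
        }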
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5b4e9548a231..20edbb99b814 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -96,47 +96,52 @@ extern const char *f2fs_fault_name[FAULT_MAX];
/*
* For mount options
*/
-#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000001
-#define F2FS_MOUNT_DISCARD 0x00000002
-#define F2FS_MOUNT_NOHEAP 0x00000004
-#define F2FS_MOUNT_XATTR_USER 0x00000008
-#define F2FS_MOUNT_POSIX_ACL 0x00000010
-#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000020
-#define F2FS_MOUNT_INLINE_XATTR 0x00000040
-#define F2FS_MOUNT_INLINE_DATA 0x00000080
-#define F2FS_MOUNT_INLINE_DENTRY 0x00000100
-#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
-#define F2FS_MOUNT_NOBARRIER 0x00000400
-#define F2FS_MOUNT_FASTBOOT 0x00000800
-#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00001000
-#define F2FS_MOUNT_DATA_FLUSH 0x00002000
-#define F2FS_MOUNT_FAULT_INJECTION 0x00004000
-#define F2FS_MOUNT_USRQUOTA 0x00008000
-#define F2FS_MOUNT_GRPQUOTA 0x00010000
-#define F2FS_MOUNT_PRJQUOTA 0x00020000
-#define F2FS_MOUNT_QUOTA 0x00040000
-#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00080000
-#define F2FS_MOUNT_RESERVE_ROOT 0x00100000
-#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x00200000
-#define F2FS_MOUNT_NORECOVERY 0x00400000
-#define F2FS_MOUNT_ATGC 0x00800000
-#define F2FS_MOUNT_MERGE_CHECKPOINT 0x01000000
-#define F2FS_MOUNT_GC_MERGE 0x02000000
-#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000
-#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000
-#define F2FS_MOUNT_NAT_BITS 0x10000000
-#define F2FS_MOUNT_INLINECRYPT 0x20000000
-/*
- * Some f2fs environments expect to be able to pass the "lazytime" option
- * string rather than using the MS_LAZYTIME flag, so this must remain.
- */
-#define F2FS_MOUNT_LAZYTIME 0x40000000
-#define F2FS_MOUNT_RESERVE_NODE 0x80000000
+enum f2fs_mount_opt {
+ F2FS_MOUNT_DISABLE_ROLL_FORWARD,
+ F2FS_MOUNT_DISCARD,
+ F2FS_MOUNT_NOHEAP,
+ F2FS_MOUNT_XATTR_USER,
+ F2FS_MOUNT_POSIX_ACL,
+ F2FS_MOUNT_DISABLE_EXT_IDENTIFY,
+ F2FS_MOUNT_INLINE_XATTR,
+ F2FS_MOUNT_INLINE_DATA,
+ F2FS_MOUNT_INLINE_DENTRY,
+ F2FS_MOUNT_FLUSH_MERGE,
+ F2FS_MOUNT_NOBARRIER,
+ F2FS_MOUNT_FASTBOOT,
+ F2FS_MOUNT_READ_EXTENT_CACHE,
+ F2FS_MOUNT_DATA_FLUSH,
+ F2FS_MOUNT_FAULT_INJECTION,
+ F2FS_MOUNT_USRQUOTA,
+ F2FS_MOUNT_GRPQUOTA,
+ F2FS_MOUNT_PRJQUOTA,
+ F2FS_MOUNT_QUOTA,
+ F2FS_MOUNT_INLINE_XATTR_SIZE,
+ F2FS_MOUNT_RESERVE_ROOT,
+ F2FS_MOUNT_DISABLE_CHECKPOINT,
+ F2FS_MOUNT_NORECOVERY,
+ F2FS_MOUNT_ATGC,
+ F2FS_MOUNT_MERGE_CHECKPOINT,
+ F2FS_MOUNT_GC_MERGE,
+ F2FS_MOUNT_COMPRESS_CACHE,
+ F2FS_MOUNT_AGE_EXTENT_CACHE,
+ F2FS_MOUNT_NAT_BITS,
+ F2FS_MOUNT_INLINECRYPT,
+ /*
+ * Some f2fs environments expect to be able to pass the "lazytime" option
+ * string rather than using the MS_LAZYTIME flag, so this must remain.
+ */
+ F2FS_MOUNT_LAZYTIME,
+ F2FS_MOUNT_RESERVE_NODE,
+};
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
-#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option)
+#define clear_opt(sbi, option) \
+ (F2FS_OPTION(sbi).opt &= ~BIT(F2FS_MOUNT_##option))
+#define set_opt(sbi, option) \
+ (F2FS_OPTION(sbi).opt |= BIT(F2FS_MOUNT_##option))
+#define test_opt(sbi, option) \
+ (F2FS_OPTION(sbi).opt & BIT(F2FS_MOUNT_##option))
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -183,7 +188,7 @@ struct f2fs_rwsem {
};
struct f2fs_mount_info {
- unsigned int opt;
+ unsigned long long opt;
block_t root_reserved_blocks; /* root reserved blocks */
block_t root_reserved_nodes; /* root reserved nodes */
kuid_t s_resuid; /* reserved blocks for uid */
@@ -245,6 +250,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_COMPRESSION 0x00002000
#define F2FS_FEATURE_RO 0x00004000
#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000
+#define F2FS_FEATURE_PACKED_SSA 0x00010000
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -281,7 +287,7 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
-#define DEF_ENABLE_INTERVAL 16 /* 16 secs */
+#define DEF_ENABLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
@@ -313,6 +319,12 @@ struct cp_control {
struct cp_stats stats;
};
+enum f2fs_cp_phase {
+ CP_PHASE_START_BLOCK_OPS,
+ CP_PHASE_FINISH_BLOCK_OPS,
+ CP_PHASE_FINISH_CHECKPOINT,
+};
+
/*
* indicate meta/data type
*/
@@ -406,6 +418,8 @@ struct discard_entry {
#define DEFAULT_DISCARD_GRANULARITY 16
/* default maximum discard granularity of ordered discard, unit: block count */
#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16
+/* default interval of periodic discard submission */
+#define DEFAULT_DISCARD_INTERVAL (msecs_to_jiffies(20))
/* max discard pend list number */
#define MAX_PLIST_NUM 512
@@ -655,8 +669,8 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */
-/* congestion wait timeout value, default: 20ms */
-#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
+/* IO/non-IO congestion wait timeout value, default: 1ms */
+#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1))
/* timeout value injected, default: 1000ms */
#define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000))
@@ -707,6 +721,12 @@ enum extent_type {
NR_EXTENT_CACHES,
};
+/*
+ * Reserved value to mark an invalid age extent, hence the valid block
+ * range is from 0 to ULLONG_MAX-1
+ */
+#define F2FS_EXTENT_AGE_INVALID ULLONG_MAX
+
struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
@@ -947,6 +967,7 @@ struct f2fs_inode_info {
unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
unsigned char i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
+ atomic_t writeback; /* count # of writeback threads */
unsigned int atomic_write_cnt;
loff_t original_i_size; /* original i_size before atomic write */
@@ -1661,6 +1682,7 @@ struct f2fs_sb_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int unusable_blocks_per_sec; /* unusable blocks per section */
unsigned int max_open_zones; /* max open zone resources of the zoned device */
/* For adjust the priority writing position of data in zone UFS */
unsigned int blkzone_alloc_policy;
@@ -1694,6 +1716,7 @@ struct f2fs_sb_info {
long interval_time[MAX_TIME]; /* to store thresholds */
struct ckpt_req_control cprc_info; /* for checkpoint request control */
struct cp_stats cp_stats; /* for time stat of checkpoint */
+ struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -1732,7 +1755,6 @@ struct f2fs_sb_info {
unsigned int meta_ino_num; /* meta inode number*/
unsigned int log_blocks_per_seg; /* log2 blocks per segment */
unsigned int blocks_per_seg; /* blocks per segment */
- unsigned int unusable_blocks_per_sec; /* unusable blocks per section */
unsigned int segs_per_sec; /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections; /* total section count */
@@ -1884,9 +1906,6 @@ struct f2fs_sb_info {
spinlock_t error_lock; /* protect errors/stop_reason array */
bool error_dirty; /* errors of sb is dirty */
- struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
- unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
-
/* For reclaimed segs statistics per each GC mode */
unsigned int gc_segment_mode; /* GC state for reclaimed segments */
unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */
@@ -2096,7 +2115,7 @@ static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio,
pgoff_t index)
{
- pgoff_t idx_in_folio = index % (1 << folio_order(folio));
+ pgoff_t idx_in_folio = index % folio_nr_pages(folio);
return (struct f2fs_super_block *)
(page_address(folio_page(folio, idx_in_folio)) +
@@ -2961,16 +2980,6 @@ static inline struct folio *f2fs_filemap_get_folio(
return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask);
}
-static inline struct page *f2fs_pagecache_get_page(
- struct address_space *mapping, pgoff_t index,
- fgf_t fgp_flags, gfp_t gfp_mask)
-{
- if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
- return NULL;
-
- return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
-}
-
static inline void f2fs_folio_put(struct folio *folio, bool unlock)
{
if (IS_ERR_OR_NULL(folio))
@@ -2983,7 +2992,7 @@ static inline void f2fs_folio_put(struct folio *folio, bool unlock)
folio_put(folio);
}
-static inline void f2fs_put_page(struct page *page, int unlock)
+static inline void f2fs_put_page(struct page *page, bool unlock)
{
if (!page)
return;
@@ -3810,7 +3819,6 @@ void f2fs_quota_off_umount(struct super_block *sb);
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag);
void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason);
void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error);
-void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
@@ -4186,6 +4194,7 @@ struct f2fs_stat_info {
int gc_secs[2][2];
int tot_blks, data_blks, node_blks;
int bg_data_blks, bg_node_blks;
+ int blkoff[NR_CURSEG_TYPE];
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
@@ -4674,7 +4683,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
f2fs_up_write(&fi->i_sem);
return true;
}
- if (f2fs_is_mmap_file(inode) ||
+ if (f2fs_is_mmap_file(inode) || atomic_read(&fi->writeback) ||
(S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) {
f2fs_up_write(&fi->i_sem);
return false;
@@ -4710,6 +4719,7 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
F2FS_FEATURE_FUNCS(readonly, RO);
F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS);
+F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA);
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi,
@@ -4764,6 +4774,18 @@ static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
return false;
}
+static inline unsigned int f2fs_hw_discard_granularity(struct f2fs_sb_info *sbi)
+{
+ int i = 1;
+ unsigned int discard_granularity = bdev_discard_granularity(sbi->sb->s_bdev);
+
+ if (f2fs_is_multi_device(sbi))
+ for (; i < sbi->s_ndevs && !bdev_is_zoned(FDEV(i).bdev); i++)
+ discard_granularity = max_t(unsigned int, discard_granularity,
+ bdev_discard_granularity(FDEV(i).bdev));
+ return discard_granularity;
+}
+
static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
{
return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
@@ -4900,22 +4922,30 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}
-static inline void f2fs_io_schedule_timeout(long timeout)
+static inline void __f2fs_schedule_timeout(long timeout, bool io)
{
set_current_state(TASK_UNINTERRUPTIBLE);
- io_schedule_timeout(timeout);
+ if (io)
+ io_schedule_timeout(timeout);
+ else
+ schedule_timeout(timeout);
}
+#define f2fs_io_schedule_timeout(timeout) \
+ __f2fs_schedule_timeout(timeout, true)
+#define f2fs_schedule_timeout(timeout) \
+ __f2fs_schedule_timeout(timeout, false)
+
static inline void f2fs_io_schedule_timeout_killable(long timeout)
{
while (timeout) {
if (fatal_signal_pending(current))
return;
set_current_state(TASK_UNINTERRUPTIBLE);
- io_schedule_timeout(DEFAULT_IO_TIMEOUT);
- if (timeout <= DEFAULT_IO_TIMEOUT)
+ io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ if (timeout <= DEFAULT_SCHEDULE_TIMEOUT)
return;
- timeout -= DEFAULT_IO_TIMEOUT;
+ timeout -= DEFAULT_SCHEDULE_TIMEOUT;
}
}
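The mount-option rework above trades hand-numbered hex masks for enum bit
indices and widens opt to 64 bits; F2FS_MOUNT_RESERVE_NODE had already consumed
bit 31, so the old 32-bit word was full. A self-contained userspace sketch of
the scheme, where BIT_ULL stands in for the kernel's BIT() and the option names
are illustrative:

        #include <stdio.h>

        #define BIT_ULL(nr) (1ULL << (nr))

        enum mount_opt { OPT_DISCARD, OPT_XATTR_USER, OPT_LAZYTIME };

        struct mount_info { unsigned long long opt; };

        #define set_opt(mi, o)   ((mi)->opt |= BIT_ULL(o))
        #define clear_opt(mi, o) ((mi)->opt &= ~BIT_ULL(o))
        #define test_opt(mi, o)  (!!((mi)->opt & BIT_ULL(o)))

        int main(void)
        {
                struct mount_info mi = { 0 };

                set_opt(&mi, OPT_LAZYTIME);
                printf("lazytime=%d discard=%d\n",
                       test_opt(&mi, OPT_LAZYTIME),
                       test_opt(&mi, OPT_DISCARD));
                clear_opt(&mi, OPT_LAZYTIME);
                return 0;
        }

Letting the compiler assign the indices removes the risk of two options
claiming the same mask, and the 64-bit word leaves room for flags 32 and up.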
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ffa045b39c01..d7047ca6b98d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1654,8 +1654,11 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
- f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
- f2fs_update_age_extent_cache_range(dn, start, index - start);
+ if (index > start) {
+ f2fs_update_read_extent_cache_range(dn, start, 0,
+ index - start);
+ f2fs_update_age_extent_cache_range(dn, start, index - start);
+ }
return ret;
}
@@ -2125,8 +2128,9 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
f2fs_down_write(&fi->i_sem);
if (!f2fs_may_compress(inode) ||
- (S_ISREG(inode->i_mode) &&
- F2FS_HAS_BLOCKS(inode))) {
+ atomic_read(&fi->writeback) ||
+ (S_ISREG(inode->i_mode) &&
+ F2FS_HAS_BLOCKS(inode))) {
f2fs_up_write(&fi->i_sem);
return -EINVAL;
}
@@ -2584,14 +2588,14 @@ static int f2fs_keep_noreuse_range(struct inode *inode,
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct fstrim_range range;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!f2fs_hw_support_discard(F2FS_SB(sb)))
+ if (!f2fs_hw_support_discard(sbi))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
@@ -2602,9 +2606,9 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (ret)
return ret;
- range.minlen = max((unsigned int)range.minlen,
- bdev_discard_granularity(sb->s_bdev));
- ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ range.minlen = max_t(unsigned int, range.minlen,
+ f2fs_hw_discard_granularity(sbi));
+ ret = f2fs_trim_fs(sbi, &range);
mnt_drop_write_file(filp);
if (ret < 0)
return ret;
@@ -2612,7 +2616,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
}
@@ -5284,6 +5288,8 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
struct inode *inode = file_inode(filp);
int err;
+ trace_f2fs_fadvise(inode, offset, len, advice);
+
if (advice == POSIX_FADV_SEQUENTIAL) {
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
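With f2fs_hw_discard_granularity() (defined in the f2fs.h hunk earlier), FITRIM
now clamps range.minlen against the coarsest discard granularity of any
non-zoned device backing the filesystem, rather than only the primary block
device. A userspace sketch of the clamping, with hypothetical per-device
values:

        #include <stdio.h>

        static unsigned int max_granularity(const unsigned int *gran,
                                            int ndevs)
        {
                unsigned int g = gran[0];

                for (int i = 1; i < ndevs; i++)
                        if (gran[i] > g)
                                g = gran[i];
                return g;
        }

        int main(void)
        {
                unsigned int gran[] = { 4096, 16384, 8192 }; /* per device */
                unsigned int minlen = 512;      /* requested by userspace */
                unsigned int g = max_granularity(gran, 3);

                if (minlen < g)
                        minlen = g;     /* as in f2fs_ioc_fitrim() above */
                printf("effective minlen: %u\n", minlen);
                return 0;
        }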
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a7708cf80c04..384fa7e2085b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -38,13 +38,14 @@ static int gc_thread_func(void *data)
struct f2fs_gc_control gc_control = {
.victim_segno = NULL_SEGNO,
.should_migrate_blocks = false,
- .err_gc_skipped = false };
+ .err_gc_skipped = false,
+ .one_time = false };
wait_ms = gc_th->min_sleep_time;
set_freezable();
do {
- bool sync_mode, foreground = false;
+ bool sync_mode, foreground = false, gc_boost = false;
wait_event_freezable_timeout(*wq,
kthread_should_stop() ||
@@ -52,8 +53,12 @@ static int gc_thread_func(void *data)
gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
- if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
+ if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) {
foreground = true;
+ gc_control.one_time = false;
+ } else if (f2fs_sb_has_blkzoned(sbi)) {
+ gc_control.one_time = true;
+ }
/* give it a try one time */
if (gc_th->gc_wake)
@@ -81,8 +86,6 @@ static int gc_thread_func(void *data)
continue;
}
- gc_control.one_time = false;
-
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
@@ -132,7 +135,7 @@ static int gc_thread_func(void *data)
if (need_to_boost_gc(sbi)) {
decrease_sleep_time(gc_th, &wait_ms);
if (f2fs_sb_has_blkzoned(sbi))
- gc_control.one_time = true;
+ gc_boost = true;
} else {
increase_sleep_time(gc_th, &wait_ms);
}
@@ -141,7 +144,7 @@ do_gc:
FOREGROUND : BACKGROUND);
sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
- (gc_control.one_time && gc_th->boost_gc_greedy);
+ (gc_boost && gc_th->boost_gc_greedy);
/* foreground GC was been triggered via f2fs_balance_fs() */
if (foreground && !f2fs_sb_has_blkzoned(sbi))
@@ -771,7 +774,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct sit_info *sm = SIT_I(sbi);
- struct victim_sel_policy p;
+ struct victim_sel_policy p = {0};
unsigned int secno, last_victim;
unsigned int last_segment;
unsigned int nsearched;
@@ -1208,7 +1211,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = f2fs_is_cow_file(inode) ?
F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping;
struct dnode_of_data dn;
- struct folio *folio;
+ struct folio *folio, *efolio;
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1263,18 +1266,19 @@ got_it:
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
- fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
- dn.data_blkaddr,
+ efolio = f2fs_filemap_get_folio(META_MAPPING(sbi), dn.data_blkaddr,
FGP_LOCK | FGP_CREAT, GFP_NOFS);
- if (!fio.encrypted_page) {
- err = -ENOMEM;
+ if (IS_ERR(efolio)) {
+ err = PTR_ERR(efolio);
goto put_folio;
}
+ fio.encrypted_page = &efolio->page;
+
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_encrypted_page;
- f2fs_put_page(fio.encrypted_page, 0);
+ f2fs_put_page(fio.encrypted_page, false);
f2fs_folio_put(folio, true);
f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
@@ -1282,7 +1286,7 @@ got_it:
return 0;
put_encrypted_page:
- f2fs_put_page(fio.encrypted_page, 1);
+ f2fs_put_page(fio.encrypted_page, true);
put_folio:
f2fs_folio_put(folio, true);
return err;
@@ -1310,7 +1314,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- struct folio *folio, *mfolio;
+ struct folio *folio, *mfolio, *efolio;
block_t newaddr;
int err = 0;
bool lfs_mode = f2fs_lfs_mode(fio.sbi);
@@ -1404,14 +1408,16 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto up_out;
}
- fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
- newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
- if (!fio.encrypted_page) {
- err = -ENOMEM;
+ efolio = f2fs_filemap_get_folio(META_MAPPING(fio.sbi), newaddr,
+ FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(efolio)) {
+ err = PTR_ERR(efolio);
f2fs_folio_put(mfolio, true);
goto recover_block;
}
+ fio.encrypted_page = &efolio->page;
+
/* write target block */
f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
memcpy(page_address(fio.encrypted_page),
@@ -1436,7 +1442,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
- f2fs_put_page(fio.encrypted_page, 1);
+ f2fs_put_page(fio.encrypted_page, true);
recover_block:
if (err)
f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
@@ -1729,7 +1735,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE;
- int submitted = 0;
+ int submitted = 0, sum_blk_cnt;
if (__is_large_section(sbi)) {
sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
@@ -1763,22 +1769,28 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
+ segno = rounddown(segno, SUMS_PER_BLOCK);
+ sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK);
/* readahead multi ssa blocks those have contiguous address */
if (__is_large_section(sbi))
f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
- end_segno - segno, META_SSA, true);
+ sum_blk_cnt, META_SSA, true);
/* reference all summary page */
while (segno < end_segno) {
- struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno++);
+ struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno);
+
+ segno += SUMS_PER_BLOCK;
if (IS_ERR(sum_folio)) {
int err = PTR_ERR(sum_folio);
- end_segno = segno - 1;
- for (segno = start_segno; segno < end_segno; segno++) {
+ end_segno = segno - SUMS_PER_BLOCK;
+ segno = rounddown(start_segno, SUMS_PER_BLOCK);
+ while (segno < end_segno) {
sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
folio_put_refs(sum_folio, 2);
+ segno += SUMS_PER_BLOCK;
}
return err;
}
@@ -1787,68 +1799,83 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
blk_start_plug(&plug);
- for (segno = start_segno; segno < end_segno; segno++) {
- struct f2fs_summary_block *sum;
+ segno = start_segno;
+ while (segno < end_segno) {
+ unsigned int cur_segno;
/* find segment summary of victim */
struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
+ unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK)
+ + SUMS_PER_BLOCK;
+
+ if (block_end_segno > end_segno)
+ block_end_segno = end_segno;
if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) {
f2fs_err(sbi, "%s: segment %u is used by log",
__func__, segno);
f2fs_bug_on(sbi, 1);
- goto skip;
+ goto next_block;
}
- if (get_valid_blocks(sbi, segno, false) == 0)
- goto freed;
- if (gc_type == BG_GC && __is_large_section(sbi) &&
- migrated >= sbi->migration_granularity)
- goto skip;
if (!folio_test_uptodate(sum_folio) ||
unlikely(f2fs_cp_error(sbi)))
- goto skip;
+ goto next_block;
- sum = folio_address(sum_folio);
- if (type != GET_SUM_TYPE((&sum->footer))) {
- f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA",
- segno, type, GET_SUM_TYPE((&sum->footer)));
- f2fs_stop_checkpoint(sbi, false,
- STOP_CP_REASON_CORRUPTED_SUMMARY);
- goto skip;
- }
+ for (cur_segno = segno; cur_segno < block_end_segno;
+ cur_segno++) {
+ struct f2fs_summary_block *sum;
- /*
- * this is to avoid deadlock:
- * - lock_page(sum_page) - f2fs_replace_block
- * - check_valid_map() - down_write(sentry_lock)
- * - down_read(sentry_lock) - change_curseg()
- * - lock_page(sum_page)
- */
- if (type == SUM_TYPE_NODE)
- submitted += gc_node_segment(sbi, sum->entries, segno,
- gc_type);
- else
- submitted += gc_data_segment(sbi, sum->entries, gc_list,
- segno, gc_type,
- force_migrate);
+ if (get_valid_blocks(sbi, cur_segno, false) == 0)
+ goto freed;
+ if (gc_type == BG_GC && __is_large_section(sbi) &&
+ migrated >= sbi->migration_granularity)
+ continue;
- stat_inc_gc_seg_count(sbi, data_type, gc_type);
- sbi->gc_reclaimed_segs[sbi->gc_mode]++;
- migrated++;
+ sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno);
+ if (type != GET_SUM_TYPE((&sum->footer))) {
+ f2fs_err(sbi, "Inconsistent segment (%u) type "
+ "[%d, %d] in SSA and SIT",
+ cur_segno, type,
+ GET_SUM_TYPE((&sum->footer)));
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_CORRUPTED_SUMMARY);
+ continue;
+ }
-freed:
- if (gc_type == FG_GC &&
- get_valid_blocks(sbi, segno, false) == 0)
- seg_freed++;
+ /*
+ * this is to avoid deadlock:
+ * - lock_page(sum_page) - f2fs_replace_block
+ * - check_valid_map() - down_write(sentry_lock)
+ * - down_read(sentry_lock) - change_curseg()
+ * - lock_page(sum_page)
+ */
+ if (type == SUM_TYPE_NODE)
+ submitted += gc_node_segment(sbi, sum->entries,
+ cur_segno, gc_type);
+ else
+ submitted += gc_data_segment(sbi, sum->entries,
+ gc_list, cur_segno,
+ gc_type, force_migrate);
- if (__is_large_section(sbi))
- sbi->next_victim_seg[gc_type] =
- (segno + 1 < sec_end_segno) ?
- segno + 1 : NULL_SEGNO;
-skip:
+ stat_inc_gc_seg_count(sbi, data_type, gc_type);
+ sbi->gc_reclaimed_segs[sbi->gc_mode]++;
+ migrated++;
+
+freed:
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, cur_segno, false) == 0)
+ seg_freed++;
+
+ if (__is_large_section(sbi))
+ sbi->next_victim_seg[gc_type] =
+ (cur_segno + 1 < sec_end_segno) ?
+ cur_segno + 1 : NULL_SEGNO;
+ }
+next_block:
folio_put_refs(sum_folio, 2);
+ segno = block_end_segno;
}
if (submitted)
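The restructured loop above follows the packed-SSA layout: when several segment
summaries share one SSA block (SUMS_PER_BLOCK > 1), the summary folio is
fetched once per block and every victim segment inside it is processed before
the folio is released. A minimal sketch of that iteration shape; the names are
illustrative, not the kernel helpers:

        #include <stdio.h>

        static void gc_one_segment(unsigned int segno)
        {
                printf("gc seg %u\n", segno);
        }

        static void gc_range(unsigned int start, unsigned int end,
                             unsigned int spb)
        {
                unsigned int segno = start;

                while (segno < end) {
                        /* segments sharing one summary block */
                        unsigned int block_end = (segno / spb) * spb + spb;

                        if (block_end > end)
                                block_end = end;
                        /* kernel: one f2fs_get_sum_folio() per block */
                        for (unsigned int cur = segno; cur < block_end; cur++)
                                gc_one_segment(cur);
                        /* kernel: folio_put_refs(sum_folio, 2) here */
                        segno = block_end;
                }
        }

        int main(void)
        {
                gc_range(3, 10, 4);     /* blocks: {3}, {4..7}, {8,9} */
                return 0;
        }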
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 24e8b1c27acc..6c4d4567571e 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -25,7 +25,7 @@
#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */
#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */
#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */
-#define DEF_GC_THREAD_VALID_THRESH_RATIO 95 /* do not GC over 95% valid block ratio for one time GC */
+#define DEF_GC_THREAD_VALID_THRESH_RATIO 80 /* do not GC over 80% valid block ratio for one time GC */
#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 58ac831ef704..e5c6a08b7e4f 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -287,7 +287,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio)
set_inode_flag(inode, FI_DATA_EXIST);
folio_clear_f2fs_inline(ifolio);
- f2fs_folio_put(ifolio, 1);
+ f2fs_folio_put(ifolio, true);
return 0;
}
@@ -577,7 +577,7 @@ recover:
f2fs_i_depth_write(dir, 0);
f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
folio_mark_dirty(ifolio);
- f2fs_folio_put(ifolio, 1);
+ f2fs_folio_put(ifolio, true);
kfree(backup_dentry);
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index f1cda1900658..38b8994bc1b2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -294,6 +294,12 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio)
return false;
}
+ if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) {
+ f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink",
+ __func__, inode->i_ino);
+ return false;
+ }
+
if (f2fs_has_extra_attr(inode)) {
if (!f2fs_sb_has_extra_attr(sbi)) {
f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off",
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index af40282a6948..043d20516a21 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -552,30 +552,31 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
- goto fail;
+ goto out;
}
err = f2fs_dquot_initialize(dir);
if (err)
- goto fail;
+ goto out;
err = f2fs_dquot_initialize(inode);
if (err)
- goto fail;
+ goto out;
de = f2fs_find_entry(dir, &dentry->d_name, &folio);
if (!de) {
if (IS_ERR(folio))
err = PTR_ERR(folio);
- goto fail;
+ goto out;
}
if (unlikely(inode->i_nlink == 0)) {
- f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink",
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has zero i_nlink",
__func__, inode->i_ino);
- err = -EFSCORRUPTED;
- set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- f2fs_folio_put(folio, false);
- goto fail;
+ goto corrupted;
+ } else if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) {
+ f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink",
+ __func__, inode->i_ino);
+ goto corrupted;
}
f2fs_balance_fs(sbi, true);
@@ -585,7 +586,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (err) {
f2fs_unlock_op(sbi);
f2fs_folio_put(folio, false);
- goto fail;
+ goto out;
}
f2fs_delete_entry(de, folio, dir, inode);
f2fs_unlock_op(sbi);
@@ -601,7 +602,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
-fail:
+
+ goto out;
+corrupted:
+ err = -EFSCORRUPTED;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_folio_put(folio, false);
+out:
trace_f2fs_unlink_exit(inode, err);
return err;
}
@@ -1053,9 +1060,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (whiteout) {
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
- if (err)
+ if (err) {
+ d_invalidate(old_dentry);
+ d_invalidate(new_dentry);
goto put_out_dir;
-
+ }
spin_lock(&whiteout->i_lock);
inode_state_clear(whiteout, I_LINKABLE);
spin_unlock(&whiteout->i_lock);
@@ -1247,11 +1256,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return 0;
out_new_dir:
if (new_dir_entry) {
- f2fs_folio_put(new_dir_folio, 0);
+ f2fs_folio_put(new_dir_folio, false);
}
out_old_dir:
if (old_dir_entry) {
- f2fs_folio_put(old_dir_folio, 0);
+ f2fs_folio_put(old_dir_folio, false);
}
out_new:
f2fs_folio_put(new_folio, false);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 215e442db72c..c3415ebb9f50 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -399,7 +399,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr,
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
- bool check_only)
+ bool check_only, bool *new_inode)
{
struct curseg_info *curseg;
block_t blkaddr, blkaddr_fast;
@@ -447,16 +447,19 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
quota_inode = true;
}
- /*
- * CP | dnode(F) | inode(DF)
- * For this case, we should not give up now.
- */
entry = add_fsync_inode(sbi, head, ino_of_node(folio),
quota_inode);
if (IS_ERR(entry)) {
err = PTR_ERR(entry);
- if (err == -ENOENT)
+ /*
+ * CP | dnode(F) | inode(DF)
+ * For this case, we should not give up now.
+ */
+ if (err == -ENOENT) {
+ if (check_only)
+ *new_inode = true;
goto next;
+ }
f2fs_folio_put(folio, true);
break;
}
@@ -519,7 +522,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sum_folio = f2fs_get_sum_folio(sbi, segno);
if (IS_ERR(sum_folio))
return PTR_ERR(sum_folio);
- sum_node = folio_address(sum_folio);
+ sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno);
sum = sum_node->entries[blkoff];
f2fs_folio_put(sum_folio, true);
got_it:
@@ -869,12 +872,14 @@ next:
int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
- struct list_head inode_list, tmp_inode_list;
- struct list_head dir_list;
+ LIST_HEAD(inode_list);
+ LIST_HEAD(tmp_inode_list);
+ LIST_HEAD(dir_list);
int err;
int ret = 0;
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+ bool new_inode = false;
f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, "
"check_only: %d", check_only);
@@ -882,16 +887,12 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
f2fs_info(sbi, "recover fsync data on readonly fs");
- INIT_LIST_HEAD(&inode_list);
- INIT_LIST_HEAD(&tmp_inode_list);
- INIT_LIST_HEAD(&dir_list);
-
/* prevent checkpoint */
f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
- err = find_fsync_dnodes(sbi, &inode_list, check_only);
- if (err || list_empty(&inode_list))
+ err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode);
+ if (err < 0 || (list_empty(&inode_list) && (!check_only || !new_inode)))
goto skip;
if (check_only) {
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index b45eace879d7..c26424f47686 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -234,7 +234,7 @@ retry:
err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
return err;
@@ -750,7 +750,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
} while (ret && --count);
if (ret) {
@@ -1343,15 +1343,9 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->di.len += len;
+ err = 0;
if (time_to_inject(sbi, FAULT_DISCARD)) {
err = -EIO;
- } else {
- err = __blkdev_issue_discard(bdev,
- SECTOR_FROM_BLOCK(start),
- SECTOR_FROM_BLOCK(len),
- GFP_NOFS, &bio);
- }
- if (err) {
spin_lock_irqsave(&dc->lock, flags);
if (dc->state == D_PARTIAL)
dc->state = D_SUBMIT;
@@ -1360,6 +1354,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
break;
}
+ __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start),
+ SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio);
f2fs_bug_on(sbi, !bio);
/*
@@ -2712,7 +2708,15 @@ struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno)
void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
void *src, block_t blk_addr)
{
- struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
+ struct folio *folio;
+
+ if (SUMS_PER_BLOCK == 1)
+ folio = f2fs_grab_meta_folio(sbi, blk_addr);
+ else
+ folio = f2fs_get_meta_folio_retry(sbi, blk_addr);
+
+ if (IS_ERR(folio))
+ return;
memcpy(folio_address(folio), src, PAGE_SIZE);
folio_mark_dirty(folio);
@@ -2720,9 +2724,21 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
}
static void write_sum_page(struct f2fs_sb_info *sbi,
- struct f2fs_summary_block *sum_blk, block_t blk_addr)
+ struct f2fs_summary_block *sum_blk, unsigned int segno)
{
- f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr);
+ struct folio *folio;
+
+ if (SUMS_PER_BLOCK == 1)
+ return f2fs_update_meta_page(sbi, (void *)sum_blk,
+ GET_SUM_BLOCK(sbi, segno));
+
+ folio = f2fs_get_sum_folio(sbi, segno);
+ if (IS_ERR(folio))
+ return;
+
+ memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk));
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
static void write_current_sum_page(struct f2fs_sb_info *sbi,
@@ -2987,7 +3003,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
int ret;
if (curseg->inited)
- write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno));
+ write_sum_page(sbi, curseg->sum_blk, segno);
segno = __get_next_segno(sbi, type);
ret = get_new_segment(sbi, &segno, new_sec, pinning);
@@ -3046,7 +3062,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
struct folio *sum_folio;
if (curseg->inited)
- write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, curseg->segno);
__set_test_and_inuse(sbi, new_segno);
@@ -3065,7 +3081,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
return PTR_ERR(sum_folio);
}
- sum_node = folio_address(sum_folio);
+ sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
f2fs_folio_put(sum_folio, true);
return 0;
@@ -3154,8 +3170,7 @@ static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
goto out;
if (get_valid_blocks(sbi, curseg->segno, false)) {
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, curseg->segno);
} else {
mutex_lock(&DIRTY_I(sbi)->seglist_lock);
__set_test_and_free(sbi, curseg->segno, true);
@@ -3452,7 +3467,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ f2fs_schedule_timeout(DEFAULT_DISCARD_INTERVAL);
goto next;
}
skip:
@@ -3833,8 +3848,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
if (segment_full) {
if (type == CURSEG_COLD_DATA_PINNED &&
!((curseg->segno + 1) % sbi->segs_per_sec)) {
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, curseg->segno);
reset_curseg_fields(curseg);
goto skip_new_segment;
}
@@ -3863,8 +3877,13 @@ skip_new_segment:
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
- if (IS_DATASEG(curseg->seg_type))
- atomic64_inc(&sbi->allocated_data_blocks);
+ if (IS_DATASEG(curseg->seg_type)) {
+ unsigned long long new_val;
+
+ new_val = atomic64_inc_return(&sbi->allocated_data_blocks);
+ if (unlikely(new_val == ULLONG_MAX))
+ atomic64_set(&sbi->allocated_data_blocks, 0);
+ }
up_write(&sit_i->sentry_lock);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 1ce2c8abaf48..07dcbcbeb7c6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -69,11 +69,16 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((!__is_valid_data_blkaddr(blk_addr)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
+#ifdef CONFIG_BLK_DEV_ZONED
#define CAP_BLKS_PER_SEC(sbi) \
(BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec)
#define CAP_SEGS_PER_SEC(sbi) \
(SEGS_PER_SEC(sbi) - \
BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec))
+#else
+#define CAP_BLKS_PER_SEC(sbi) BLKS_PER_SEC(sbi)
+#define CAP_SEGS_PER_SEC(sbi) SEGS_PER_SEC(sbi)
+#endif
#define GET_START_SEG_FROM_SEC(sbi, segno) \
(rounddown(segno, SEGS_PER_SEC(sbi)))
#define GET_SEC_FROM_SEG(sbi, segno) \
@@ -85,8 +90,12 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
-#define GET_SUM_BLOCK(sbi, segno) \
- ((sbi)->sm_info->ssa_blkaddr + (segno))
+#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE)
+#define GET_SUM_BLOCK(sbi, segno) \
+ (SM_I(sbi)->ssa_blkaddr + ((segno) / SUMS_PER_BLOCK))
+#define GET_SUM_BLKOFF(segno) ((segno) % SUMS_PER_BLOCK)
+#define SUM_BLK_PAGE_ADDR(folio, segno) \
+ (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE)
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
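
A worked example of the new mapping, assuming 16KB filesystem blocks (so
SUMS_PER_BLOCK == 4) and segno == 10:

/*
 * GET_SUM_BLOCK(sbi, 10)       -> ssa_blkaddr + 10 / 4 = ssa_blkaddr + 2
 * GET_SUM_BLKOFF(10)           -> 10 % 4 = 2
 * SUM_BLK_PAGE_ADDR(folio, 10) -> folio_address(folio) + 2 * 4096
 *
 * i.e. the summary for segment 10 starts 8KB into the third SSA block,
 * where the legacy layout would have spent a whole 16KB block on it.
 */
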
@@ -603,10 +612,12 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi,
enum log_type type, unsigned int segno)
{
- if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
- return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi,
- (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
+ if (f2fs_lfs_mode(sbi)) {
+ unsigned int used_blocks = __is_large_section(sbi) ? SEGS_TO_BLKS(sbi,
+ (segno - GET_START_SEG_FROM_SEC(sbi, segno))) : 0;
+ return CAP_BLKS_PER_SEC(sbi) - used_blocks -
CURSEG_I(sbi, type)->next_blkoff;
+ }
return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 47489d48f2b9..c4c225e09dc4 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -352,7 +352,7 @@ static match_table_t f2fs_checkpoint_tokens = {
struct f2fs_fs_context {
struct f2fs_mount_info info;
- unsigned int opt_mask; /* Bits changed */
+ unsigned long long opt_mask; /* Bits changed */
unsigned int spec_mask;
unsigned short qname_mask;
};
@@ -360,23 +360,23 @@ struct f2fs_fs_context {
#define F2FS_CTX_INFO(ctx) ((ctx)->info)
static inline void ctx_set_opt(struct f2fs_fs_context *ctx,
- unsigned int flag)
+ enum f2fs_mount_opt flag)
{
- ctx->info.opt |= flag;
- ctx->opt_mask |= flag;
+ ctx->info.opt |= BIT(flag);
+ ctx->opt_mask |= BIT(flag);
}
static inline void ctx_clear_opt(struct f2fs_fs_context *ctx,
- unsigned int flag)
+ enum f2fs_mount_opt flag)
{
- ctx->info.opt &= ~flag;
- ctx->opt_mask |= flag;
+ ctx->info.opt &= ~BIT(flag);
+ ctx->opt_mask |= BIT(flag);
}
static inline bool ctx_test_opt(struct f2fs_fs_context *ctx,
- unsigned int flag)
+ enum f2fs_mount_opt flag)
{
- return ctx->info.opt & flag;
+ return ctx->info.opt & BIT(flag);
}
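
With the option flags now passed as enum bit *indices* rather than pre-shifted
masks, BIT() does the shifting and the widened 64-bit opt_mask can track more
than 32 options. A self-contained sketch of the pattern (hypothetical option
names):

#include <stdbool.h>
#include <stdint.h>

#define BIT(nr) (1ULL << (nr))

enum demo_mount_opt { DEMO_MOUNT_DISCARD = 0, DEMO_MOUNT_LAZYTIME = 33 };

struct demo_ctx {
	uint64_t opt;		/* current option values */
	uint64_t opt_mask;	/* which bits the user explicitly set */
};

static void demo_set_opt(struct demo_ctx *c, enum demo_mount_opt f)
{
	c->opt |= BIT(f);
	c->opt_mask |= BIT(f);
}

static bool demo_test_opt(const struct demo_ctx *c, enum demo_mount_opt f)
{
	return c->opt & BIT(f);
}
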
void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
@@ -1371,7 +1371,7 @@ static int f2fs_check_compression(struct fs_context *fc,
ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE))
f2fs_info(sbi, "Image doesn't support compression");
clear_compression_spec(ctx);
- ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE;
+ ctx->opt_mask &= ~BIT(F2FS_MOUNT_COMPRESS_CACHE);
return 0;
}
if (ctx->spec_mask & F2FS_SPEC_compress_extension) {
@@ -1439,42 +1439,42 @@ static int f2fs_check_opt_consistency(struct fs_context *fc,
return -EINVAL;
if (f2fs_hw_should_discard(sbi) &&
- (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) &&
!ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
f2fs_warn(sbi, "discard is required for zoned block devices");
return -EINVAL;
}
if (!f2fs_hw_support_discard(sbi) &&
- (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) &&
ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
f2fs_warn(sbi, "device does not support discard");
ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
- ctx->opt_mask &= ~F2FS_MOUNT_DISCARD;
+ ctx->opt_mask &= ~BIT(F2FS_MOUNT_DISCARD);
}
if (f2fs_sb_has_device_alias(sbi) &&
- (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) &&
+ (ctx->opt_mask & BIT(F2FS_MOUNT_READ_EXTENT_CACHE)) &&
!ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) {
f2fs_err(sbi, "device aliasing requires extent cache");
return -EINVAL;
}
if (test_opt(sbi, RESERVE_ROOT) &&
- (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) &&
+ (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_ROOT)) &&
ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) {
f2fs_info(sbi, "Preserve previous reserve_root=%u",
F2FS_OPTION(sbi).root_reserved_blocks);
ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
- ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT;
+ ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_ROOT);
}
if (test_opt(sbi, RESERVE_NODE) &&
- (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) &&
+ (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_NODE)) &&
ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) {
f2fs_info(sbi, "Preserve previous reserve_node=%u",
F2FS_OPTION(sbi).root_reserved_nodes);
ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE);
- ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE;
+ ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_NODE);
}
err = f2fs_check_test_dummy_encryption(fc, sb);
@@ -1759,6 +1759,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
atomic_set(&fi->open_count, 0);
+ atomic_set(&fi->writeback, 0);
init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
@@ -1988,14 +1989,6 @@ static void f2fs_put_super(struct super_block *sb)
truncate_inode_pages_final(META_MAPPING(sbi));
}
- for (i = 0; i < NR_COUNT_TYPE; i++) {
- if (!get_pages(sbi, i))
- continue;
- f2fs_err(sbi, "detect filesystem reference count leak during "
- "umount, type: %d, count: %lld", i, get_pages(sbi, i));
- f2fs_bug_on(sbi, 1);
- }
-
f2fs_bug_on(sbi, sbi->fsync_node_num);
f2fs_destroy_compress_inode(sbi);
@@ -2006,6 +1999,15 @@ static void f2fs_put_super(struct super_block *sb)
iput(sbi->meta_inode);
sbi->meta_inode = NULL;
+ /* Check the page counts only after all node/meta pages have been dropped */
+ for (i = 0; i < NR_COUNT_TYPE; i++) {
+ if (!get_pages(sbi, i))
+ continue;
+ f2fs_err(sbi, "detected filesystem reference count leak during "
+ "umount, type: %d, count: %lld", i, get_pages(sbi, i));
+ f2fs_bug_on(sbi, 1);
+ }
+
/*
* iput() can update stat information, if f2fs_write_checkpoint()
* above failed with error.
@@ -2026,7 +2028,6 @@ static void f2fs_put_super(struct super_block *sb)
kfree(sbi->raw_super);
f2fs_destroy_page_array_cache(sbi);
- f2fs_destroy_xattr_caches(sbi);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
@@ -2632,12 +2633,14 @@ restore_flag:
return err;
}
-static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
+static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16;
- long long start, writeback, end;
+ long long start, writeback, lock, sync_inode, end;
+ int ret;
- f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld",
+ f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld",
+ __func__,
get_pages(sbi, F2FS_DIRTY_META),
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DATA));
@@ -2649,18 +2652,25 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
while (get_pages(sbi, F2FS_DIRTY_DATA)) {
writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC);
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
if (f2fs_time_over(sbi, ENABLE_TIME))
break;
}
writeback = ktime_get();
- sync_inodes_sb(sbi->sb);
+ f2fs_down_write(&sbi->cp_enable_rwsem);
+
+ lock = ktime_get();
+
+ if (get_pages(sbi, F2FS_DIRTY_DATA))
+ sync_inodes_sb(sbi->sb);
if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA)))
- f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld",
- get_pages(sbi, F2FS_DIRTY_DATA));
+ f2fs_warn(sbi, "%s: has some unwritten data: %lld",
+ __func__, get_pages(sbi, F2FS_DIRTY_DATA));
+
+ sync_inode = ktime_get();
f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -2669,16 +2679,32 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
set_sbi_flag(sbi, SBI_IS_DIRTY);
f2fs_up_write(&sbi->gc_lock);
- f2fs_sync_fs(sbi->sb, 1);
+ f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld",
+ __func__,
+ get_pages(sbi, F2FS_DIRTY_META),
+ get_pages(sbi, F2FS_DIRTY_IMETA),
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DENTS),
+ get_pages(sbi, F2FS_DIRTY_QDATA));
+ ret = f2fs_sync_fs(sbi->sb, 1);
+ if (ret)
+ f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret);
/* Let's ensure there's no pending checkpoint anymore */
f2fs_flush_ckpt_thread(sbi);
+ f2fs_up_write(&sbi->cp_enable_rwsem);
+
end = ktime_get();
- f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu",
- ktime_ms_delta(writeback, start),
- ktime_ms_delta(end, writeback));
+ f2fs_info(sbi, "%s end, writeback:%llu, "
+ "lock:%llu, sync_inode:%llu, sync_fs:%llu",
+ __func__,
+ ktime_ms_delta(writeback, start),
+ ktime_ms_delta(lock, writeback),
+ ktime_ms_delta(sync_inode, lock),
+ ktime_ms_delta(end, sync_inode));
+ return ret;
}
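
f2fs_enable_checkpoint() now reports per-phase latencies instead of a single
writeback/sync split. The underlying pattern, sketched in userspace with
clock_gettime() standing in for ktime_get()/ktime_ms_delta():

#include <stdio.h>
#include <time.h>

static long long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

int main(void)
{
	long long start = now_ms();
	/* ... phase 1: write back dirty data ... */
	long long writeback = now_ms();
	/* ... phase 2: take locks, sync inodes, sync fs ... */
	long long end = now_ms();

	printf("writeback:%lld ms, sync:%lld ms\n",
	       writeback - start, end - writeback);
	return 0;
}
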
static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
@@ -2892,7 +2918,9 @@ static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
goto restore_discard;
need_enable_checkpoint = true;
} else {
- f2fs_enable_checkpoint(sbi);
+ err = f2fs_enable_checkpoint(sbi);
+ if (err)
+ goto restore_discard;
need_disable_checkpoint = true;
}
}
@@ -2935,7 +2963,8 @@ skip:
return 0;
restore_checkpoint:
if (need_enable_checkpoint) {
- f2fs_enable_checkpoint(sbi);
+ if (f2fs_enable_checkpoint(sbi))
+ f2fs_warn(sbi, "checkpoint has not been enabled");
} else if (need_disable_checkpoint) {
if (f2fs_disable_checkpoint(sbi))
f2fs_warn(sbi, "checkpoint has not been disabled");
@@ -3110,7 +3139,7 @@ retry:
&folio, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -4051,6 +4080,20 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
if (sanity_check_area_boundary(sbi, folio, index))
return -EFSCORRUPTED;
+ /*
+ * Check for legacy summary layout on 16KB+ block devices.
+ * Modern f2fs-tools packs multiple 4KB summary areas into one block,
+ * whereas legacy versions used one block per summary, leading
+ * to a much larger SSA.
+ */
+ if (SUMS_PER_BLOCK > 1 &&
+ !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) {
+ f2fs_info(sbi, "Error: Device formatted with a legacy version. "
+ "Please reformat with a tool supporting the packed ssa "
+ "feature for block sizes larger than 4kb.");
+ return -EOPNOTSUPP;
+ }
+
return 0;
}
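
The mount-time gate above fires only when the running kernel packs summaries
(SUMS_PER_BLOCK > 1) but the image lacks the feature bit. The decision,
reduced to a standalone sketch (hypothetical helper; -EOPNOTSUPP as in the
patch):

#include <errno.h>
#include <stdbool.h>

static int check_ssa_layout(int sums_per_block, bool has_packed_ssa)
{
	if (sums_per_block == 1)
		return 0;	/* 4KB blocks: legacy and packed agree */
	if (has_packed_ssa)
		return 0;	/* 16KB+ image from modern f2fs-tools */
	return -EOPNOTSUPP;	/* legacy 16KB+ image: refuse to mount */
}
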
@@ -4544,50 +4587,9 @@ void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
spin_unlock_irqrestore(&sbi->error_lock, flags);
}
-static bool f2fs_update_errors(struct f2fs_sb_info *sbi)
-{
- unsigned long flags;
- bool need_update = false;
-
- spin_lock_irqsave(&sbi->error_lock, flags);
- if (sbi->error_dirty) {
- memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors,
- MAX_F2FS_ERRORS);
- sbi->error_dirty = false;
- need_update = true;
- }
- spin_unlock_irqrestore(&sbi->error_lock, flags);
-
- return need_update;
-}
-
-static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error)
-{
- int err;
-
- f2fs_down_write(&sbi->sb_lock);
-
- if (!f2fs_update_errors(sbi))
- goto out_unlock;
-
- err = f2fs_commit_super(sbi, false);
- if (err)
- f2fs_err_ratelimited(sbi,
- "f2fs_commit_super fails to record errors:%u, err:%d",
- error, err);
-out_unlock:
- f2fs_up_write(&sbi->sb_lock);
-}
-
void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error)
{
f2fs_save_errors(sbi, error);
- f2fs_record_errors(sbi, error);
-}
-
-void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error)
-{
- f2fs_save_errors(sbi, error);
if (!sbi->error_dirty)
return;
@@ -4904,6 +4906,7 @@ try_onemore:
init_f2fs_rwsem(&sbi->node_change);
spin_lock_init(&sbi->stat_lock);
init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->cp_enable_rwsem);
init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
spin_lock_init(&sbi->error_lock);
@@ -5015,13 +5018,9 @@ try_onemore:
if (err)
goto free_iostat;
- /* init per sbi slab cache */
- err = f2fs_init_xattr_caches(sbi);
- if (err)
- goto free_percpu;
err = f2fs_init_page_array_cache(sbi);
if (err)
- goto free_xattr_cache;
+ goto free_percpu;
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
@@ -5226,11 +5225,15 @@ try_onemore:
}
} else {
err = f2fs_recover_fsync_data(sbi, true);
-
- if (!f2fs_readonly(sb) && err > 0) {
- err = -EINVAL;
- f2fs_err(sbi, "Need to recover fsync data");
- goto free_meta;
+ if (err > 0) {
+ if (!f2fs_readonly(sb)) {
+ f2fs_err(sbi, "Need to recover fsync data");
+ err = -EINVAL;
+ goto free_meta;
+ } else {
+ f2fs_info(sbi, "drop all fsynced data");
+ err = 0;
+ }
}
}
@@ -5257,13 +5260,12 @@ reset_checkpoint:
if (err)
goto sync_free_meta;
- if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT))
err = f2fs_disable_checkpoint(sbi);
- if (err)
- goto sync_free_meta;
- } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
- f2fs_enable_checkpoint(sbi);
- }
+ else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))
+ err = f2fs_enable_checkpoint(sbi);
+ if (err)
+ goto sync_free_meta;
/*
* If filesystem is not mounted as read-only then
@@ -5350,8 +5352,6 @@ free_meta_inode:
sbi->meta_inode = NULL;
free_page_array_cache:
f2fs_destroy_page_array_cache(sbi);
-free_xattr_cache:
- f2fs_destroy_xattr_caches(sbi);
free_percpu:
destroy_percpu_info(sbi);
free_iostat:
@@ -5554,10 +5554,15 @@ static int __init init_f2fs_fs(void)
err = f2fs_create_casefold_cache();
if (err)
goto free_compress_cache;
- err = register_filesystem(&f2fs_fs_type);
+ err = f2fs_init_xattr_cache();
if (err)
goto free_casefold_cache;
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto free_xattr_cache;
return 0;
+free_xattr_cache:
+ f2fs_destroy_xattr_cache();
free_casefold_cache:
f2fs_destroy_casefold_cache();
free_compress_cache:
@@ -5598,6 +5603,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
unregister_filesystem(&f2fs_fs_type);
+ f2fs_destroy_xattr_cache();
f2fs_destroy_casefold_cache();
f2fs_destroy_compress_cache();
f2fs_destroy_compress_mempool();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 6d2a4fba68a2..c42f4f979d13 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -235,6 +235,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_compression(sbi))
len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "compression");
+ if (f2fs_sb_has_packed_ssa(sbi))
+ len += sysfs_emit_at(buf, len, "%s%s",
+ len ? ", " : "", "packed_ssa");
len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "pin_file");
len += sysfs_emit_at(buf, len, "\n");
@@ -1210,6 +1213,7 @@ F2FS_SBI_GENERAL_RW_ATTR(last_age_weight);
F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count);
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
+F2FS_SBI_GENERAL_RO_ATTR(max_open_zones);
F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
@@ -1296,6 +1300,7 @@ F2FS_FEATURE_RO_ATTR(pin_file);
#ifdef CONFIG_UNICODE
F2FS_FEATURE_RO_ATTR(linear_lookup);
#endif
+F2FS_FEATURE_RO_ATTR(packed_ssa);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -1384,6 +1389,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(unusable_blocks_per_sec),
+ ATTR_LIST(max_open_zones),
ATTR_LIST(blkzone_alloc_policy),
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1455,6 +1461,7 @@ static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_UNICODE
BASE_ATTR_LIST(linear_lookup),
#endif
+ BASE_ATTR_LIST(packed_ssa),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
@@ -1490,6 +1497,7 @@ F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS);
+F2FS_SB_FEATURE_RO_ATTR(packed_ssa, PACKED_SSA);
static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_encryption),
@@ -1507,6 +1515,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_compression),
ATTR_LIST(sb_readonly),
ATTR_LIST(sb_device_alias),
+ ATTR_LIST(sb_packed_ssa),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_sb_feat);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index f0ab9a3c7a82..05b935b55216 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -263,7 +263,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
- folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ folio = f2fs_filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 58632a2b6613..b4e5c406632f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -23,11 +23,12 @@
#include "xattr.h"
#include "segment.h"
+static struct kmem_cache *inline_xattr_slab;
static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
{
- if (likely(size == sbi->inline_xattr_slab_size)) {
+ if (likely(size == DEFAULT_XATTR_SLAB_SIZE)) {
*is_inline = true;
- return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab,
+ return f2fs_kmem_cache_alloc(inline_xattr_slab,
GFP_F2FS_ZERO, false, sbi);
}
*is_inline = false;
@@ -38,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
bool is_inline)
{
if (is_inline)
- kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
+ kmem_cache_free(inline_xattr_slab, xattr_addr);
else
kfree(xattr_addr);
}
@@ -830,25 +831,14 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
return err;
}
-int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi)
+int __init f2fs_init_xattr_cache(void)
{
- dev_t dev = sbi->sb->s_bdev->bd_dev;
- char slab_name[32];
-
- sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev));
-
- sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size *
- sizeof(__le32) + XATTR_PADDING_SIZE;
-
- sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name,
- sbi->inline_xattr_slab_size);
- if (!sbi->inline_xattr_slab)
- return -ENOMEM;
-
- return 0;
+ inline_xattr_slab = f2fs_kmem_cache_create("f2fs_xattr_entry",
+ DEFAULT_XATTR_SLAB_SIZE);
+ return inline_xattr_slab ? 0 : -ENOMEM;
}
-void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi)
+void f2fs_destroy_xattr_cache(void)
{
- kmem_cache_destroy(sbi->inline_xattr_slab);
-}
+ kmem_cache_destroy(inline_xattr_slab);
+}
\ No newline at end of file
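
The per-superblock slab can collapse into one module-wide cache because its
object size no longer depends on the mount: it is the compile-time
DEFAULT_XATTR_SLAB_SIZE (see the xattr.h hunk below). Rough arithmetic,
assuming the usual DEFAULT_INLINE_XATTR_ADDRS of 50 and a 4-byte
XATTR_PADDING_SIZE:

/*
 * DEFAULT_XATTR_SLAB_SIZE = 50 * sizeof(__le32) + 4
 *                         = 50 * 4 + 4 = 204 bytes
 *
 * One fixed size for every mount is what allows a single global
 * kmem_cache, created in init_f2fs_fs() and destroyed in exit_f2fs_fs().
 */
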
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 4fc0b2305fbd..bce3d93e4755 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -89,6 +89,8 @@ struct f2fs_xattr_entry {
F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \
DEF_INLINE_RESERVED_SIZE - \
MIN_INLINE_DENTRY_SIZE / sizeof(__le32))
+#define DEFAULT_XATTR_SLAB_SIZE (DEFAULT_INLINE_XATTR_ADDRS * \
+ sizeof(__le32) + XATTR_PADDING_SIZE)
/*
* On-disk structure of f2fs_xattr
@@ -132,8 +134,8 @@ int f2fs_setxattr(struct inode *, int, const char *, const void *,
int f2fs_getxattr(struct inode *, int, const char *, void *,
size_t, struct folio *);
ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
-int f2fs_init_xattr_caches(struct f2fs_sb_info *);
-void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
+int __init f2fs_init_xattr_cache(void);
+void f2fs_destroy_xattr_cache(void);
#else
#define f2fs_xattr_handlers NULL
@@ -150,8 +152,8 @@ static inline int f2fs_getxattr(struct inode *inode, int index,
{
return -EOPNOTSUPP;
}
-static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
-static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
+static inline int __init f2fs_init_xattr_cache(void) { return 0; }
+static inline void f2fs_destroy_xattr_cache(void) { }
#endif
#ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 6afb4a13b81d..a7880787cad3 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -17,6 +17,7 @@
#define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */
#define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */
#define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */
+#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */
#define F2FS_MAX_EXTENSION 64 /* # of extension entries */
#define F2FS_EXTENSION_LEN 8 /* max size of extension */
@@ -441,7 +442,7 @@ struct f2fs_sit_block {
* from node's page's beginning to get a data block address.
* ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
*/
-#define ENTRIES_IN_SUM (F2FS_BLKSIZE / 8)
+#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8)
#define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */
#define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */
#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM)
@@ -467,7 +468,7 @@ struct summary_footer {
__le32 check_sum; /* summary checksum */
} __packed;
-#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
+#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\
SUM_ENTRY_SIZE)
#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
sizeof(struct nat_journal_entry))
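
Pinning F2FS_SUM_BLKSIZE at 4096 keeps the on-disk summary geometry identical
to what 4KB-block filesystems have always had; a larger F2FS_BLKSIZE only
changes how many such summaries are packed per block. The derived constants:

/*
 * ENTRIES_IN_SUM   = 4096 / 8        = 512 entries
 * SUM_ENTRY_SIZE   = 7 * 512         = 3584 bytes
 * SUM_JOURNAL_SIZE = 4096 - 5 - 3584 = 507 bytes
 *
 * All three are unchanged from the 4KB-block layout, so packed SSA is a
 * placement change, not a format change.
 */
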
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index edbbd869078f..df4017dcc701 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -50,6 +50,9 @@ TRACE_DEFINE_ENUM(CP_PAUSE);
TRACE_DEFINE_ENUM(CP_RESIZE);
TRACE_DEFINE_ENUM(EX_READ);
TRACE_DEFINE_ENUM(EX_BLOCK_AGE);
+TRACE_DEFINE_ENUM(CP_PHASE_START_BLOCK_OPS);
+TRACE_DEFINE_ENUM(CP_PHASE_FINISH_BLOCK_OPS);
+TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
#define show_block_type(type) \
__print_symbolic(type, \
@@ -175,6 +178,12 @@ TRACE_DEFINE_ENUM(EX_BLOCK_AGE);
#define S_ALL_PERM (S_ISUID | S_ISGID | S_ISVTX | \
S_IRWXU | S_IRWXG | S_IRWXO)
+#define show_cp_phase(phase) \
+ __print_symbolic(phase, \
+ { CP_PHASE_START_BLOCK_OPS, "start block_ops" }, \
+ { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \
+ { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" })
+
struct f2fs_sb_info;
struct f2fs_io_info;
struct extent_info;
@@ -204,7 +213,7 @@ DECLARE_EVENT_CLASS(f2fs__inode,
__entry->pino = F2FS_I(inode)->i_pino;
__entry->mode = inode->i_mode;
__entry->nlink = inode->i_nlink;
- __entry->size = inode->i_size;
+ __entry->size = i_size_read(inode);
__entry->blocks = inode->i_blocks;
__entry->advise = F2FS_I(inode)->i_advise;
),
@@ -353,7 +362,7 @@ TRACE_EVENT(f2fs_unlink_enter,
TP_fast_assign(
__entry->dev = dir->i_sb->s_dev;
__entry->ino = dir->i_ino;
- __entry->size = dir->i_size;
+ __entry->size = i_size_read(dir);
__entry->blocks = dir->i_blocks;
__assign_str(name);
),
@@ -433,7 +442,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->size = inode->i_size;
+ __entry->size = i_size_read(inode);
__entry->blocks = inode->i_blocks;
__entry->from = from;
),
@@ -586,6 +595,38 @@ TRACE_EVENT(f2fs_file_write_iter,
__entry->ret)
);
+TRACE_EVENT(f2fs_fadvise,
+
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int advice),
+
+ TP_ARGS(inode, offset, len, advice),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(int, advice)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->size = i_size_read(inode);
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->advice = advice;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld offset:%llu, len:%llu, advise:%d",
+ show_dev_ino(__entry),
+ (unsigned long long)__entry->size,
+ __entry->offset,
+ __entry->len,
+ __entry->advice)
+);
+
TRACE_EVENT(f2fs_map_blocks,
TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag,
int ret),
@@ -1006,7 +1047,7 @@ TRACE_EVENT(f2fs_fallocate,
__entry->mode = mode;
__entry->offset = offset;
__entry->len = len;
- __entry->size = inode->i_size;
+ __entry->size = i_size_read(inode);
__entry->blocks = inode->i_blocks;
__entry->ret = ret;
),
@@ -1541,26 +1582,26 @@ TRACE_EVENT(f2fs_readpages,
TRACE_EVENT(f2fs_write_checkpoint,
- TP_PROTO(struct super_block *sb, int reason, const char *msg),
+ TP_PROTO(struct super_block *sb, int reason, u16 phase),
- TP_ARGS(sb, reason, msg),
+ TP_ARGS(sb, reason, phase),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, reason)
- __string(dest_msg, msg)
+ __field(u16, phase)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->reason = reason;
- __assign_str(dest_msg);
+ __entry->phase = phase;
),
TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
show_dev(__entry->dev),
show_cpreason(__entry->reason),
- __get_str(dest_msg))
+ show_cp_phase(__entry->phase))
);
DECLARE_EVENT_CLASS(f2fs_discard,
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 45d15460b495..34b31a855ea4 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
* io_uring_queue_async_work - called before submitting a new async work
*
* @req: pointer to a submitted request
- * @rw: type of workqueue, hashed or normal
+ * @hashed: whether async work is hashed
*
* Allows tracing of asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
- TP_PROTO(struct io_kiocb *req, int rw),
+ TP_PROTO(struct io_kiocb *req, bool hashed),
- TP_ARGS(req, rw),
+ TP_ARGS(req, hashed),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( u8, opcode )
__field( unsigned long long, flags )
__field( struct io_wq_work *, work )
- __field( int, rw )
+ __field( bool, hashed )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->flags = (__force unsigned long long) req->flags;
__entry->opcode = req->opcode;
__entry->work = &req->work;
- __entry->rw = rw;
+ __entry->hashed = hashed;
__assign_str(op_str);
),
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
__get_str(op_str), __entry->flags,
- __entry->rw ? "hashed" : "normal", __entry->work)
+ __entry->hashed ? "hashed" : "normal", __entry->work)
);
/**
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 1d03b2fc4b25..cd13d8aac3d2 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -805,11 +805,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
*/
if (fatal_signal_pending(current))
return false;
- if (worker->init_retries++ >= WORKER_INIT_LIMIT)
- return false;
+ worker->init_retries++;
switch (err) {
case -EAGAIN:
+ return worker->init_retries <= WORKER_INIT_LIMIT;
+ /* Analogous to a fork() syscall, always retry on a restartable error */
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
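
The retry cap now applies only to -EAGAIN; restartable errors retry without
limit, matching fork() semantics. The policy in isolation (the ERESTART*
values are kernel-internal and reproduced here so the sketch is standalone;
WORKER_INIT_LIMIT is assumed to mirror io-wq's constant):

#include <errno.h>
#include <stdbool.h>

#define ERESTARTSYS	512	/* kernel-internal restart codes */
#define ERESTARTNOINTR	513
#define ERESTARTNOHAND	514
#define WORKER_INIT_LIMIT 3

static bool should_retry(long err, int *init_retries)
{
	(*init_retries)++;
	switch (-err) {
	case EAGAIN:		/* transient failure: give up after the cap */
		return *init_retries <= WORKER_INIT_LIMIT;
	case ERESTARTSYS:	/* restartable, like fork(): always retry */
	case ERESTARTNOINTR:
	case ERESTARTNOHAND:
		return true;
	}
	return false;
}
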
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8a329556f8df..796d131107dd 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -44,11 +44,11 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
buf_len -= this_len;
/* Stop looping for invalid buffer length of 0 */
if (buf_len || !this_len) {
- buf->addr += this_len;
- buf->len = buf_len;
+ WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
+ WRITE_ONCE(buf->len, buf_len);
return false;
}
- buf->len = 0;
+ WRITE_ONCE(buf->len, 0);
bl->head++;
len -= this_len;
}
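
buf here points into a ring shared with userspace, so every access must be a
single untorn load or store; plain accesses let the compiler re-read, cache,
or split them. A userspace model of the annotation (the kernel's
READ_ONCE()/WRITE_ONCE() boil down to volatile accesses much like these):

#include <stdint.h>

#define READ_ONCE(x)	 (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct demo_buf {
	uint64_t addr;
	uint32_t len;
};

/* Advance a shared buffer by 'used' bytes: one load and one store per
 * field, never a read-modify-write the compiler is free to tear. */
static void consume(struct demo_buf *buf, uint32_t used)
{
	WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + used);
	WRITE_ONCE(buf->len, READ_ONCE(buf->len) - used);
}
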
@@ -198,9 +198,9 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
if (*len == 0 || *len > buf_len)
*len = buf_len;
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
- req->buf_index = buf->bid;
+ req->buf_index = READ_ONCE(buf->bid);
sel.buf_list = bl;
- sel.addr = u64_to_user_ptr(buf->addr);
+ sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
if (io_should_commit(req, issue_flags)) {
io_kbuf_commit(req, sel.buf_list, *len, 1);
@@ -280,7 +280,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
if (!arg->max_len)
arg->max_len = INT_MAX;
- req->buf_index = buf->bid;
+ req->buf_index = READ_ONCE(buf->bid);
do {
u32 len = READ_ONCE(buf->len);
@@ -291,11 +291,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
arg->partial_map = 1;
if (iov != arg->iovs)
break;
- buf->len = len;
+ WRITE_ONCE(buf->len, len);
}
}
- iov->iov_base = u64_to_user_ptr(buf->addr);
+ iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
iov->iov_len = len;
iov++;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8aa4e3a31e73..aac4b3b881fb 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}
+static void io_poll_remove_waitq(struct io_poll *poll)
+{
+ /*
+ * If the waitqueue is being freed early but someone already holds
+ * ownership over it, we have to tear down the request as best we can.
+ * That means immediately removing the request from its waitqueue and
+ * preventing all further accesses to the waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon as poll->head is
+ * NULL'ed out, the request can be completed and freed, since
+ * io_poll_remove_entry() will no longer need to take the waitqueue
+ * lock.
+ */
+ smp_store_release(&poll->head, NULL);
+}
+
static inline void io_poll_remove_entry(struct io_poll *poll)
{
struct wait_queue_head *head = smp_load_acquire(&poll->head);
if (head) {
spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
+ io_poll_remove_waitq(poll);
spin_unlock_irq(&head->lock);
}
}
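
Both former call sites now share io_poll_remove_waitq(), so the ordering
contract is stated once: the release store to poll->head must come last. The
generic shape of the pairing, in C11 (the kernel uses
smp_store_release()/smp_load_acquire()):

#include <stdatomic.h>
#include <stddef.h>

struct obj {
	_Atomic(void *) head;
};

static void teardown(struct obj *o)
{
	/* ... unlink and all other tear-down writes first ... */
	atomic_store_explicit(&o->head, NULL, memory_order_release);
	/* after this store the object may be freed by the other side */
}

static void *observe(struct obj *o)
{
	/* pairs with the release store: a reader that sees the stored
	 * value also sees every write made before the store */
	return atomic_load_explicit(&o->head, memory_order_acquire);
}
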
@@ -368,23 +386,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
io_poll_mark_cancelled(req);
/* we have to kick tw in case it's not already */
io_poll_execute(req, 0);
-
- /*
- * If the waitqueue is being freed early but someone is already
- * holds ownership over it, we have to tear down the request as
- * best we can. That means immediately removing the request from
- * its waitqueue and preventing all further accesses to the
- * waitqueue via the request.
- */
- list_del_init(&poll->wait.entry);
-
- /*
- * Careful: this *must* be the last step, since as soon
- * as req->head is NULL'ed out, the request can be
- * completed and freed, since aio_poll_complete_work()
- * will no longer need to take the waitqueue lock.
- */
- smp_store_release(&poll->head, NULL);
+ io_poll_remove_waitq(poll);
return 1;
}
@@ -413,8 +415,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
+ io_poll_remove_waitq(poll);
if (wqe_is_double(wait))
req->flags &= ~REQ_F_DOUBLE_POLL;
else
@@ -937,12 +938,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
/* successfully updated, don't complete poll request */
- if (!ret2 || ret2 == -EIOCBQUEUED)
+ if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
goto out;
+ /* request completed as part of the update, complete it */
+ else if (ret2 == IOU_COMPLETE)
+ goto complete;
}
- req_set_fail(preq);
io_req_set_res(preq, -ECANCELED, 0);
+complete:
+ if (preq->cqe.res < 0)
+ req_set_fail(preq);
preq->io_task_work.func = io_req_task_complete;
io_req_task_work_add(preq);
out:
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 3765a50329a8..a63474b331bf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1186,12 +1186,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
return -EBUSY;
nbufs = src_ctx->buf_table.nr;
+ if (!nbufs)
+ return -ENXIO;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
return -EINVAL;
else if (arg->nr > IORING_MAX_REG_BUFFERS)
return -EINVAL;
+ if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
+ return -EOVERFLOW;
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
if (nbufs > IORING_MAX_REG_BUFFERS)
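
Hoisting the validation up front means no partially built table has to be
unwound on bad input. The source-range check in isolation, using the
GCC/Clang builtin that backs the kernel's check_add_overflow():

#include <stdbool.h>

static bool src_range_ok(unsigned int nr, unsigned int src_off,
			 unsigned int nbufs)
{
	unsigned int off;

	if (__builtin_add_overflow(nr, src_off, &off))
		return false;	/* nr + src_off wrapped around */
	return off <= nbufs;	/* cloned range must fit the source table */
}
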
@@ -1201,31 +1205,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (ret)
return ret;
- /* Fill entries in data from dst that won't overlap with src */
+ /* Copy original dst nodes from before the cloned range */
for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
- struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+ struct io_rsrc_node *node = ctx->buf_table.nodes[i];
- if (src_node) {
- data.nodes[i] = src_node;
- src_node->refs++;
+ if (node) {
+ data.nodes[i] = node;
+ node->refs++;
}
}
- ret = -ENXIO;
- nbufs = src_ctx->buf_table.nr;
- if (!nbufs)
- goto out_free;
- ret = -EINVAL;
- if (!arg->nr)
- arg->nr = nbufs;
- else if (arg->nr > nbufs)
- goto out_free;
- ret = -EOVERFLOW;
- if (check_add_overflow(arg->nr, arg->src_off, &off))
- goto out_free;
- if (off > nbufs)
- goto out_free;
-
off = arg->dst_off;
i = arg->src_off;
nr = arg->nr;
@@ -1238,8 +1227,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
} else {
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!dst_node) {
- ret = -ENOMEM;
- goto out_free;
+ io_rsrc_data_free(ctx, &data);
+ return -ENOMEM;
}
refcount_inc(&src_node->buf->refs);
@@ -1249,6 +1238,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
i++;
}
+ /* Copy original dst nodes from after the cloned range */
+ for (i = nbufs; i < ctx->buf_table.nr; i++) {
+ struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+
+ if (node) {
+ data.nodes[i] = node;
+ node->refs++;
+ }
+ }
+
/*
* If asked for replace, put the old table. data->nodes[] holds both
* old and new nodes at this point.
@@ -1265,10 +1264,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
WARN_ON_ONCE(ctx->buf_table.nr);
ctx->buf_table = data;
return 0;
-
-out_free:
- io_rsrc_data_free(ctx, &data);
- return ret;
}
/*