From e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Apr 2025 17:12:59 +0200 Subject: writeback: Avoid contention on wb->list_lock when switching inodes There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup exits which owns many (thousands) inodes and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo Signed-off-by: Jan Kara --- fs/fs-writeback.c | 99 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 36 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..b0e9092ccf04 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -486,13 +486,11 @@ skip_switch: return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab out reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. 
+ */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. 
*/ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return restart; } -- cgit From 66c14dccd810d42ec5c73bb8a9177489dfd62278 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:36 +0200 Subject: writeback: Avoid softlockup when switching many inodes process_inode_switch_wbs_work() can be switching over 100 inodes to a different cgroup. Since switching an inode requires counting all dirty & under-writeback pages in the address space of each inode, this can take a significant amount of time. Add a possibility to reschedule after processing each inode to avoid softlockups. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b0e9092ccf04..36ef1a796d4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -500,6 +500,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, */ down_read(&bdi->wb_switch_rwsem); + inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -510,6 +511,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ +relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); @@ -518,10 +520,17 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } - for (inodep = isw->inodes; *inodep; inodep++) { + while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; + inodep++; + if (*inodep && need_resched()) { + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + cond_resched(); + goto relock; + } } spin_unlock(&new_wb->list_lock); -- cgit From 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:37 +0200 Subject: writeback: Avoid excessively long inode switching times With lazytime mount option enabled we can be switching many dirty inodes on cgroup exit to the parent cgroup. The numbers observed in practice when systemd slice of a large cron job exits can easily reach hundreds of thousands or millions. 
The logic in inode_do_switch_wbs() which sorts the inode into appropriate place in b_dirty list of the target wb however has linear complexity in the number of dirty inodes thus overall time complexity of switching all the inodes is quadratic leading to workers being pegged for hours consuming 100% of the CPU and switching inodes to the parent wb. Simple reproducer of the issue: FILES=10000 # Filesystem mounted with lazytime mount option MNT=/mnt/ echo "Creating files and switching timestamps" for (( j = 0; j < 50; j ++ )); do mkdir $MNT/dir$j for (( i = 0; i < $FILES; i++ )); do echo "foo" >$MNT/dir$j/file$i done touch -a -t 202501010000 $MNT/dir$j/file* done wait echo "Syncing and flushing" sync echo 3 >/proc/sys/vm/drop_caches echo "Reading all files from a cgroup" mkdir /sys/fs/cgroup/unified/mycg1 || exit echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit for (( j = 0; j < 50; j ++ )); do cat /mnt/dir$j/file* >/dev/null & done wait echo "Switching wbs" # Now rmdir the cgroup after the script exits We need to maintain b_dirty list ordering to keep writeback happy so instead of sorting inode into appropriate place just append it at the end of the list and clobber dirtied_time_when. This may result in inode writeback starting later after cgroup switch however cgroup switches are rare so it shouldn't matter much. Since the cgroup had write access to the inode, there are no practical concerns of the possible DoS issues. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36ef1a796d4b..af5f396449f1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode, * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. - * The transfer preserves @inode->dirtied_when ordering. If the @inode - * was clean, it means it was on the b_attached list, so move it onto - * the b_attached list of @new_wb. + * If the @inode was clean, it means it was on the b_attached list, so + * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { - struct inode *pos; - - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; + /* + * We need to keep b_dirty list sorted by + * dirtied_time_when. However properly sorting the + * inode in the list gets too expensive when switching + * many inodes. So just attach inode at the end of the + * dirty list and clobber the dirtied_time_when. + */ + inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, - pos->i_io_list.prev); + &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } -- cgit From 0cee64c547e3c9cda646af3e075a64f445ee8148 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:38 +0200 Subject: writeback: Add tracepoint to track pending inode switches Add trace_inode_switch_wbs_queue tracepoint to allow insight into how many inodes are queued to switch their bdi_writeback structure. 
Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index af5f396449f1..52129267e3bd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -667,6 +667,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inodes[0] = inode; + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); return; @@ -752,6 +753,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) return restart; } + trace_inode_switch_wbs_queue(wb, new_wb, nr); wb_queue_isw(new_wb, isw); return restart; -- cgit
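
Note on the tracepoint added by the last patch: the diffs above are limited to fs/fs-writeback.c, so the TRACE_EVENT() declaration itself (which lives in include/trace/events/writeback.h) is not shown. Below is a minimal sketch of what such a declaration could look like. The prototype is taken from the call sites trace_inode_switch_wbs_queue(wb, new_wb, nr) in the patch; the recorded fields and the __trace_wb_assign_cgroup() helper are assumptions modeled on the existing inode_switch_wbs tracepoint in that header, not the actual upstream definition.

/*
 * Illustrative sketch only -- intended to sit in include/trace/events/writeback.h
 * next to the other writeback tracepoints; field choices are assumptions.
 */
TRACE_EVENT(inode_switch_wbs_queue,

	TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
		 unsigned int count),

	TP_ARGS(old_wb, new_wb, count),

	TP_STRUCT__entry(
		__field(ino_t,		old_cgroup_ino)	/* cgroup owning the old wb */
		__field(ino_t,		new_cgroup_ino)	/* cgroup the inodes move to */
		__field(unsigned int,	count)		/* number of inodes queued */
	),

	TP_fast_assign(
		__entry->old_cgroup_ino	= __trace_wb_assign_cgroup(old_wb);
		__entry->new_cgroup_ino	= __trace_wb_assign_cgroup(new_wb);
		__entry->count		= count;
	),

	TP_printk("old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
		  (unsigned long)__entry->old_cgroup_ino,
		  (unsigned long)__entry->new_cgroup_ino,
		  __entry->count)
);

With a declaration along these lines, the event can be enabled through the standard tracefs path (events/writeback/inode_switch_wbs_queue/enable) to observe how many inodes each cgroup exit queues for switching, which is the insight the commit message describes.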