From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:36 +1100 Subject: fs: protect inode->i_state with inode->i_lock Protect inode state transitions and validity checks with the inode->i_lock. This enables us to make inode state transitions independently of the inode_lock and is the first step to peeling away the inode_lock from the code. This requires that __iget() is done atomically with i_state checks during list traversals so that we don't race with another thread marking the inode I_FREEING between the state check and grabbing the reference. Also remove the unlock_new_inode() memory barrier optimisation required to avoid taking the inode_lock when clearing I_NEW. Simplify the code by simply taking the inode->i_lock around the state change and wakeup. Because the wakeup is no longer tricky, remove the wake_up_inode() function and open code the wakeup where necessary. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/fs-writeback.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c6e4956786..efd1ebe879cc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -306,10 +306,12 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); } } @@ -333,6 +335,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; + spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -348,6 +351,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { + spin_unlock(&inode->i_lock); requeue_io(inode); return 0; } @@ -363,6 +367,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -384,8 +389,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { @@ -395,6 +402,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -436,6 +444,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + spin_unlock(&inode->i_lock); return ret; } @@ -506,7 +515,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * kind does not need peridic writeout yet, and for the latter * kind writeout is handled by the freer. */ + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -515,10 +526,14 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } __iget(inode); + spin_unlock(&inode->i_lock); + pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); if (wbc->pages_skipped != pages_skipped) { @@ -724,7 +739,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } spin_unlock(&inode_lock); } @@ -1017,6 +1034,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) block_dump___mark_inode_dirty(inode); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1028,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_SYNC) - goto out; + goto out_unlock_inode; /* * Only add valid (hashed) inodes to the superblock's @@ -1036,11 +1054,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out; + goto out_unlock_inode; } if (inode->i_state & I_FREEING) - goto out; + goto out_unlock_inode; + spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). @@ -1065,7 +1084,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); } + goto out; } +out_unlock_inode: + spin_unlock(&inode->i_lock); out: spin_unlock(&inode_lock); @@ -1111,14 +1133,16 @@ static void wait_sb_inodes(struct super_block *sb) * we still have to wait for that writeout. */ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + struct address_space *mapping = inode->i_mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* * We hold a reference to 'inode' so it couldn't have -- cgit From 55fa6091d83160ca772fc37cebae45d42695a708 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:40 +1100 Subject: fs: move i_sb_list out from under inode_lock Protect the per-sb inode list with a new global lock inode_sb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/fs-writeback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index efd1ebe879cc..5de56a2182bb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1123,7 +1123,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -1143,14 +1143,15 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. */ iput(old_inode); old_inode = inode; @@ -1159,9 +1160,9 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); } -- cgit From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:41 +1100 Subject: fs: move i_wb_list out from under inode_lock Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/fs-writeback.c | 76 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5de56a2182bb..ed800656356b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) spin_unlock_bh(&bdi->wb_lock); } +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either + * the caller has an active reference on the inode or the inode has I_WILL_FREE + * set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode->i_state & I_FREEING) goto out_unlock_inode; - spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } - goto out; } out_unlock_inode: spin_unlock(&inode->i_lock); -out: - spin_unlock(&inode_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); -- cgit From 0f1b1fd86f6fd662e04da3e82a6780b226fcd0d1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:43 +1100 Subject: fs: pull inode->i_lock up out of writeback_single_inode First thing we do in writeback_single_inode() is take the i_lock and the last thing we do is drop it. A caller already holds the i_lock, so pull the i_lock out of writeback_single_inode() to reduce the round trips on this lock during inode writeback. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/fs-writeback.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ed800656356b..b5ed541fb137 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -332,9 +332,9 @@ static void inode_wait_for_writeback(struct inode *inode) } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either - * the caller has an active reference on the inode or the inode has I_WILL_FREE - * set. + * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * inode->i_lock. Either the caller has an active reference on the inode or + * the inode has I_WILL_FREE set. * * If `wait' is set, wait on the writeout. * @@ -349,7 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; - spin_lock(&inode->i_lock); + assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&inode->i_lock); + if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -365,7 +367,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - spin_unlock(&inode->i_lock); requeue_io(inode); return 0; } @@ -456,7 +457,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); - spin_unlock(&inode->i_lock); return ret; } @@ -544,7 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, } __iget(inode); - spin_unlock(&inode->i_lock); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); @@ -555,6 +554,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } + spin_unlock(&inode->i_lock); spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); @@ -1309,7 +1309,9 @@ int write_inode_now(struct inode *inode, int sync) might_sleep(); spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, &wbc); + spin_unlock(&inode->i_lock); spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); @@ -1333,7 +1335,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) int ret; spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, wbc); + spin_unlock(&inode->i_lock); spin_unlock(&inode_wb_list_lock); return ret; } -- cgit