From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:28:17 -0800 Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE readpage(), prepare_write(), and commit_write() callers are updated to understand the special return code AOP_TRUNCATED_PAGE in the style of writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that the callee has unlocked the page and that the operation should be tried again with a new page. OCFS2 uses this to detect and work around a lock inversion in its aop methods. There should be no change in behaviour for methods that don't return AOP_TRUNCATED_PAGE. WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are made enums so that kerneldoc can be used to document their semantics. Signed-off-by: Zach Brown --- mm/filemap.c | 73 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 23 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde158..6e1d08a2b8b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -831,8 +831,13 @@ readpage: /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); - if (unlikely(error)) + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } goto readpage_error; + } if (!PageUptodate(page)) { lock_page(page); @@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) @@ -1331,10 +1334,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1358,10 +1365,14 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1444,10 +1455,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1470,10 +1485,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. */ - unlock_page(page); - page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; @@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } if (likely(copied > 0)) { if (!status) status = copied; -- cgit From bbfbb7cec9dd7266534b2b4b9c8be2fa425bbfc9 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] find_lock_page(): call __lock_page() directly. As find_lock_page() already checks with TestSetPageLocked() that page is locked, there is no need to call lock_page() that will try-lock page again (chances of page being unlocked in between are small). Call __lock_page() directly, this saves one atomic operation. Also, mark truncate-while-slept path as unlikely while we are here. (akpm: ug. But this is actually a common path for normal old read()s against a page which is under readahead I/O so ho-hum.) Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 6e1d08a2b8b9..4ef24a397684 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -555,11 +555,12 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); - lock_page(page); + __lock_page(page); read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (unlikely(page->mapping != mapping || + page->index != offset)) { unlock_page(page); page_cache_release(page); goto repeat; -- cgit From 268fc16e343b4f8e249468747db2e658da46a814 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:12 -0800 Subject: [PATCH] export/change sync_page_range/_nolock() This exports/changes the sync_page_range/_nolock(). The fatfs needs sync_page_range/_nolock() for expanding truncate, and changes "size_t count" to "loff_t count". Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 4ef24a397684..8fdf36508023 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, size_t count) + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range); * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ -static int sync_page_range_nolock(struct inode *inode, - struct address_space *mapping, - loff_t pos, size_t count) +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode, ret = wait_on_page_writeback_range(mapping, start, end); return ret; } +EXPORT_SYMBOL(sync_page_range_nolock); /** * filemap_fdatawait - walk the list of under-writeback pages of the given -- cgit From 28fd129827b00e12829d48a5290f46277600619b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:14 -0800 Subject: [PATCH] Fix and add EXPORT_SYMBOL(filemap_write_and_wait) This patch add EXPORT_SYMBOL(filemap_write_and_wait) and use it. See mm/filemap.c: And changes the filemap_write_and_wait() and filemap_write_and_wait_range(). Current filemap_write_and_wait() doesn't wait if filemap_fdatawrite() returns error. However, even if filemap_fdatawrite() returned an error, it may have submitted the partially data pages to the device. (e.g. in the case of -ENOSPC) Andrew Morton writes, If filemap_fdatawrite() returns an error, this might be due to some I/O problem: dead disk, unplugged cable, etc. Given the generally crappy quality of the kernel's handling of such exceptions, there's a good chance that the filemap_fdatawait() will get stuck in D state forever. So, this patch doesn't wait if filemap_fdatawrite() returns the -EIO. Trond, could you please review the nfs part? Especially I'm not sure, nfs must use the "filemap_fdatawrite(inode->i_mapping) == 0", or not. Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 8fdf36508023..478f4c74cc31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - if (retval == 0) - retval = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } } - return retval; + return err; } /* -- cgit From 1b1dcc1b57a49136f118a0f16367256ff9994a69 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 9 Jan 2006 15:59:24 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, ->i_sem This patch converts the inode semaphore to a mutex. I have tested it on XFS and compiled as much as one can consider on an ia64. Anyway your luck with it might be different. Modified-by: Ingo Molnar (finished the conversion) Signed-off-by: Jes Sorensen Signed-off-by: Ingo Molnar --- mm/filemap.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 478f4c74cc31..5fca2737c971 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -61,7 +61,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * - * ->i_sem + * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem @@ -73,9 +73,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->lock_page (access_process_vm) * * ->mmap_sem - * ->i_sem (msync) + * ->i_mutex (msync) * - * ->i_sem + * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock @@ -276,7 +276,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. * - * We need to re-take i_sem during the generic_osync_inode list walk because + * We need to re-take i_mutex during the generic_osync_inode list walk because * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, @@ -290,9 +290,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); @@ -301,7 +301,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /* - * Note: Holding i_sem across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ @@ -1892,7 +1892,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. - * i_sem is held, which protects generic_osync_inode() from + * i_mutex is held, which protects generic_osync_inode() from * livelocking. */ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2195,10 +2195,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2220,9 +2220,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2256,9 +2256,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, struct inode *inode = mapping->host; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err; @@ -2272,7 +2272,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, EXPORT_SYMBOL(generic_file_writev); /* - * Called under i_sem for writes to S_ISREG files. Returns -EIO if something + * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ static ssize_t -- cgit From 870f481793b585323fbda3e87c54efc116f46351 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:01 -0800 Subject: [PATCH] replace inode_update_time with file_update_time To allow various options to work per-mount instead of per-sb we need a struct vfsmount when updating ctime and mtime. This preparation patch replaces the inode_update_time routine with a file_update_atime routine so we can easily get at the vfsmount. (and the file makes more sense in this context anyway). Also get rid of the unused second argument - we always want to update the ctime when calling this routine. Signed-off-by: Christoph Hellwig Cc: Al Viro Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5fca2737c971..96de772be487 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2108,7 +2108,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { -- cgit From c59ede7b78db329949d9cdcd7064e22d357560ef Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 11 Jan 2006 12:17:46 -0800 Subject: [PATCH] move capable() to capability.h - Move capable() from sched.h to capability.h; - Use where capable() is used (in include/, block/, ipc/, kernel/, a few drivers/, mm/, security/, & sound/; many more drivers/ to go) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 96de772be487..a965b6b35f26 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include -- cgit From 053837fce7aa79025ed57656855df09f80175527 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Jan 2006 17:42:27 -0800 Subject: [PATCH] mm: migration page refcounting fix Migration code currently does not take a reference to target page properly, so between unlocking the pte and trying to take a new reference to the page with isolate_lru_page, anything could happen to it. Fix this by holding the pte lock until we get a chance to elevate the refcount. Other small cleanups while we're here. Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index a965b6b35f26..44da3d476994 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) -- cgit