From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:04 +0100 Subject: buffer: pass GFP flags to folio_alloc_buffers() Patch series "Add and use bdev_getblk()", v2. This patch series fixes a bug reported by Hui Zhu; see proposed patches v1 and v2: https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/ https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/ I decided to go in a rather different direction for this fix, and fix a related problem at the same time. I don't think there's any urgency to rush this into Linus' tree, nor have I marked it for stable. Reasonable people may disagree. This patch (of 8): Instead of creating entirely new flags, inherit them from grow_dev_page(). The other callers create the same flags that this function used to create. Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..32338b7cfeb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -915,16 +915,12 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry) + gfp_t gfp) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; struct mem_cgroup *memcg, *old_memcg; - if (retry) - gfp |= __GFP_NOFAIL; - /* The folio lock pins the memcg */ memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); @@ -967,7 +963,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { - return folio_alloc_buffers(page_folio(page), size, retry); + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; + if (retry) + gfp |= __GFP_NOFAIL; + + return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1069,7 +1069,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, true); + bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); /* * Link the folio to the buffers and initialise them. Take the @@ -1644,8 +1644,9 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; - head = folio_alloc_buffers(folio, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; -- cgit From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:05 +0100 Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp() grow_dev_page() is only called by grow_buffers(). grow_buffers() is only called by __getblk_slow() and __getblk_slow() is only called from __getblk_gfp(), so it is safe to move the GFP flags setting all the way up. With that done, add a new bdev_getblk() entry point that leaves the GFP flags the way the caller specified them. [willy@infradead.org: fix grow_dev_page() error handling] Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/buffer.c | 61 ++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 21 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 32338b7cfeb9..80e96c1fcd33 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1043,20 +1043,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; - gfp_t gfp_mask; - - gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; - - /* - * XXX: __getblk_slow() can not really deal with failure and - * will endlessly loop on improvised global reclaim. Prefer - * looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp_mask |= __GFP_NOFAIL; folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); bh = folio_buffers(folio); if (bh) { @@ -1069,7 +1060,10 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); + ret = -ENOMEM; + bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); + if (!bh) + goto failed; /* * Link the folio to the buffers and initialise them. Take the @@ -1420,24 +1414,49 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) } EXPORT_SYMBOL(__find_get_block); +/** + * bdev_getblk - Get a buffer_head in a block device's buffer cache. + * @bdev: The block device. + * @block: The block number. + * @size: The size of buffer_heads for this @bdev. + * @gfp: The memory allocation flags to use. + * + * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; + * they are not augmented with the mapping's GFP flags. + * + * Return: The buffer head, or NULL if memory could not be allocated. + */ +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_alloc(gfp); + if (bh) + return bh; + + return __getblk_slow(bdev, block, size, gfp); +} +EXPORT_SYMBOL(bdev_getblk); + /* * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. - * - * __getblk_gfp() will lock up the machine if grow_dev_page's - * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * __getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - might_sleep(); - if (bh == NULL) - bh = __getblk_slow(bdev, block, size, gfp); - return bh; + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + return bdev_getblk(bdev, block, size, gfp); } EXPORT_SYMBOL(__getblk_gfp); -- cgit From 775d9b10530a0a303dc08a9cdb2ed1f8667d9c5f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:07 +0100 Subject: buffer: use bdev_getblk() to avoid memory reclaim in readahead path __getblk() adds __GFP_NOFAIL, which is unnecessary for readahead; we're quite comfortable with the possibility that we may not get a bh back. Switch to bdev_getblk() which does not include __GFP_NOFAIL. Link: https://lkml.kernel.org/r/20230914150011.843330-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 80e96c1fcd33..edd118594565 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1465,7 +1465,9 @@ EXPORT_SYMBOL(__getblk_gfp); */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = bdev_getblk(bdev, block, size, + GFP_NOWAIT | __GFP_MOVABLE); + if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); -- cgit From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:11 +0100 Subject: buffer: remove __getblk_gfp() Inline it into __bread_gfp(). Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index edd118594565..edec8652788c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1421,9 +1421,6 @@ EXPORT_SYMBOL(__find_get_block); * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * - * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; - * they are not augmented with the mapping's GFP flags. - * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, @@ -1439,27 +1436,6 @@ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(bdev_getblk); -/* - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head - * which corresponds to the passed block_device, block and size. The - * returned buffer has its reference count incremented. - */ -struct buffer_head * -__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp) -{ - gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - - /* - * Prefer looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp |= __GFP_NOFAIL; - - return bdev_getblk(bdev, block, size, gfp); -} -EXPORT_SYMBOL(__getblk_gfp); - /* * Do async read-ahead on a buffer.. */ @@ -1491,7 +1467,17 @@ struct buffer_head * __bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); + struct buffer_head *bh; + + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); -- cgit From 6ba924d341c24027d95352ae8802c9cd1c308559 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:05 +0100 Subject: buffer: use folio_end_read() There are two places that we can use this new helper. Link: https://lkml.kernel.org/r/20231004165317.1061855-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index edec8652788c..86ddfa52c699 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If all of the buffers are uptodate then we can set the page - * uptodate. - */ - if (folio_uptodate) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, folio_uptodate); return; still_busy: @@ -2433,12 +2427,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (!nr) { /* - * All buffers are uptodate - we can set the folio uptodate - * as well. But not if get_block() returned an error. + * All buffers are uptodate or get_block() returned an + * error when trying to map them - we can finish the read. */ - if (!page_error) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, !page_error); return 0; } -- cgit From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:49 +0100 Subject: buffer: make folio_create_empty_buffers() return a buffer_head Patch series "Finish the create_empty_buffers() transition", v2. Pankaj recently added folio_create_empty_buffers() as the folio equivalent to create_empty_buffers(). This patch set finishes the conversion by first converting all remaining filesystems to call folio_create_empty_buffers(), then renaming it back to create_empty_buffers(). I took the opportunity to make a few simplifications like making folio_create_empty_buffers() return the head buffer and extracting get_nth_bh() from nilfs2. A few of the patches in this series aren't directly related to create_empty_buffers(), but I saw them while I was working on this and thought they'd be easy enough to add to this series. Compile-tested only, other than ext4. This patch (of 26): Almost all callers want to know the first BH that was allocated for this folio. We already have that handy, so return it. Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 86ddfa52c699..bfa7604d1891 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,8 +1641,8 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state) +struct buffer_head *folio_create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; @@ -1669,6 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, } folio_attach_private(folio, head); spin_unlock(&folio->mapping->private_lock); + + return head; } EXPORT_SYMBOL(folio_create_empty_buffers); @@ -1770,13 +1772,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { + struct buffer_head *bh; + BUG_ON(!folio_test_locked(folio)); - if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, - 1 << READ_ONCE(inode->i_blkbits), - b_state); - return folio_buffers(folio); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), b_state); + return bh; } /* @@ -2676,10 +2680,8 @@ int block_truncate_page(struct address_space *mapping, return PTR_ERR(folio); bh = folio_buffers(folio); - if (!bh) { - folio_create_empty_buffers(folio, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = folio_create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); -- cgit From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:14 +0100 Subject: buffer: remove folio_create_empty_buffers() With all users converted, remove the old create_empty_buffers() and rename folio_create_empty_buffers() to create_empty_buffers(). Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index bfa7604d1891..657a62bab73d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,7 +1641,7 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -struct buffer_head *folio_create_empty_buffers(struct folio *folio, +struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; @@ -1672,13 +1672,6 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio, return head; } -EXPORT_SYMBOL(folio_create_empty_buffers); - -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) -{ - folio_create_empty_buffers(page_folio(page), blocksize, b_state); -} EXPORT_SYMBOL(create_empty_buffers); /** @@ -1778,7 +1771,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, 1 << READ_ONCE(inode->i_blkbits), b_state); return bh; } @@ -2681,7 +2674,7 @@ int block_truncate_page(struct address_space *mapping, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, blocksize, 0); + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); -- cgit