diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 1520 |
1 files changed, 795 insertions, 725 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; + /* Last byte contained in bbio + 1 . */ + loff_t next_file_offset; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; @@ -221,22 +223,17 @@ static void __process_folios_contig(struct address_space *mapping, } static noinline void unlock_delalloc_folio(const struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_folio); - if (index == locked_folio->index && end_index == index) - return; __process_folios_contig(inode->i_mapping, locked_folio, start, end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -246,9 +243,6 @@ static noinline int lock_delalloc_folios(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_folio->index && index == end_index) - return 0; - folio_batch_init(&fbatch); while (index <= end_index) { unsigned int found_folios, i; @@ -340,7 +334,7 @@ again: /* @delalloc_end can be -1, never go beyond @orig_end */ *end = min(delalloc_end, orig_end); - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return false; } @@ -366,7 +360,7 @@ again: /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { max_bytes = PAGE_SIZE; @@ -379,13 +373,13 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, cached_state); + ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, cached_state); - unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); @@ -403,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, end, page_ops); @@ -425,14 +419,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -462,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Only order 0 (single page) folios are allowed for data. */ - ASSERT(folio_order(folio) == 0); - /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, @@ -488,11 +479,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -512,43 +503,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - u64 start; - u64 end; - u32 len; + u64 start = folio_pos(folio) + fi.offset; btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - /* - * We always issue full-sector reads, but if some block in a - * folio fails to read, blk_update_request() will advance - * bv_offset and adjust bv_len to compensate. Print a warning - * for unaligned offsets, and an error if they don't add up to - * a full sector. - */ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page read in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page read with offset %zu and length %zu", - fi.offset, fi.length); - - start = folio_pos(folio) + fi.offset; - end = start + fi.length - 1; - len = fi.length; if (likely(uptodate)) { + u64 end = start + fi.length - 1; loff_t i_size = i_size_read(inode); /* @@ -573,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_folio_read(folio, uptodate, start, len); + end_folio_read(folio, uptodate, start, fi.length); } bio_put(bio); } @@ -664,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct folio *folio, u64 disk_bytenr, - unsigned int pg_offset) + u64 disk_bytenr, loff_t file_offset) { struct bio *bio = &bio_ctrl->bbio->bio; - struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -681,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, } /* - * The contig check requires the following conditions to be met: - * - * 1) The folios are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. + * To merge into a bio both the disk sector and the logical offset in + * the file need to be contiguous. */ - return bio_end_sector(bio) == sector && - folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == - folio_pos(folio) + pg_offset; + return bio_ctrl->next_file_offset == file_offset && + bio_end_sector(bio) == sector; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -711,6 +670,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->next_file_offset = file_offset; /* Limit data write bios to the ordered boundary. */ if (bio_ctrl->wbc) { @@ -752,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = folio_to_inode(folio); + loff_t file_offset = folio_pos(folio) + pg_offset; - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) submit_one_bio(bio_ctrl); do { u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bbio) { - alloc_new_bio(inode, bio_ctrl, disk_bytenr, - folio_pos(folio) + pg_offset); - } + if (!bio_ctrl->bbio) + alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { @@ -781,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, folio, - len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; pg_offset += len; disk_bytenr += len; + file_offset += len; /* * len_to_oe_boundary defaults to U32_MAX, which isn't folio or @@ -836,7 +796,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -870,7 +830,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -887,8 +847,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -903,13 +863,13 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, if (*em_cached) { em = *em_cached; - if (extent_map_in_tree(em) && start >= em->start && - start < extent_map_end(em)) { + if (btrfs_extent_map_in_tree(em) && start >= em->start && + start < btrfs_extent_map_end(em)) { refcount_inc(&em->refs); return em; } - free_extent_map(em); + btrfs_free_extent_map(em); *em_cached = NULL; } @@ -935,16 +895,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -955,45 +911,49 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + BUG_ON(btrfs_extent_map_end(em) <= cur); BUG_ON(end < cur); - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else - disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = btrfs_extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1037,23 +997,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (prev_em_start) *prev_em_start = em->start; - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1064,15 +1019,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start at. + * Output as where the next check should start at, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Have we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @next_ret to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + btrfs_lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. */ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1083,11 +1213,11 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); /* * If btrfs_do_readpage() failed we will want to submit the assembled @@ -1105,7 +1235,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1118,12 +1248,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1157,9 +1287,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1184,14 +1315,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1264,7 +1395,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1272,8 +1403,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges. */ - unlock_extent(&inode->io_tree, found_start, - found_start + found_len - 1, NULL); + btrfs_unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); @@ -1309,7 +1440,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1324,7 +1455,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1333,7 +1464,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. */ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1379,19 +1510,19 @@ static int submit_one_sector(struct btrfs_inode *inode, return PTR_ERR(em); extent_offset = filepos - em->start; - em_end = extent_map_end(em); + em_end = btrfs_extent_map_end(em); ASSERT(filepos <= em_end); ASSERT(IS_ALIGNED(em->start, sectorsize)); ASSERT(IS_ALIGNED(em->len, sectorsize)); - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = btrfs_extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* @@ -1434,6 +1565,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1442,21 +1574,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1530,6 +1664,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1543,7 +1678,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl return 0; } - if (folio->index == end_index) + if (folio_contains(folio, end_index)) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); /* @@ -1551,6 +1686,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1562,14 +1721,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1615,8 +1774,18 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_unlock_irqrestore(&xas, flags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, @@ -1702,45 +1871,166 @@ static void set_btree_ioerr(struct extent_buffer *eb) } } +static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned long end) +{ + XA_STATE(xas, &fs_info->buffer_tree, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; + batch->cur = 0; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch *batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!atomic_inc_not_zero(&eb->refs)) { + xas_reset(xas); + goto retry; + } + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + xas_reset(xas); + goto retry; + } + + return eb; +} + +static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_tree, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + goto out; + } + } + if (end == ULONG_MAX) + *start = ULONG_MAX; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return batch->nr; +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->sectorsize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !atomic_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } + buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -1792,199 +2082,56 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } /* - * Submit one subpage btree page. - * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. + * Wait for all eb writeback in the given range to finish. * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. - * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: The fs_info for this file system. + * @start: The offset of the range to start waiting on writeback. + * @end: The end of the range, inclusive. This is meant to be used in + * conjuction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct eb_batch batch; + unsigned long start_index = (start >> fs_info->sectorsize_bits); + unsigned long end_index = (end >> fs_info->sectorsize_bits); + + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. - */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, + PAGECACHE_TAG_WRITEBACK, &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start++; - continue; - } - - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. - */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); - } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. - * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. - */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. */ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -1995,25 +2142,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. */ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = (wbc->range_start >> fs_info->sectorsize_bits); + end = (wbc->range_end >> fs_info->sectorsize_bits); + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -2023,31 +2172,39 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; + + if (ret) { + done = 1; + break; + } continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = (wbc->nr_to_write <= 0); + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { @@ -2192,10 +2349,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2233,7 +2388,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2314,8 +2469,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2325,13 +2480,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. */ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2390,15 +2550,15 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); submit_one_bio(&bio_ctrl); } @@ -2422,7 +2582,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, &cached_state); + btrfs_lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -2430,46 +2590,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent(tree, start, end, &cached_state); + btrfs_unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for release_folio, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. + * A helper for struct address_space_operations::release_folio, this tests for + * areas of the folio that are locked or under IO and drops the related state + * bits if it is safe to drop the folio. */ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { + struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; - bool ret; + u64 end = start + folio_size(folio) - 1; + u32 range_bits; + u32 clear_bits; + bool ret = false; + int ret2; - if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = false; - } else { - u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | - EXTENT_QGROUP_RESERVED); - int ret2; + btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); - /* - * At this point we can safely clear everything except the - * locked bit, the nodatasum bit and the delalloc new bit. - * The delalloc new bit will be cleared by ordered extent - * completion. - */ - ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + /* + * We can release the folio if it's locked only for ordered extent + * completion, since that doesn't require using the folio. + */ + if ((range_bits & EXTENT_LOCKED) && + !(range_bits & EXTENT_FINISHING_ORDERED)) + goto out; + + clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | + EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | + EXTENT_FINISHING_ORDERED); + /* + * At this point we can safely clear everything except the locked, + * nodatasum, delalloc new and finishing ordered bits. The delalloc new + * bit will be cleared by ordered extent completion. + */ + ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); + /* + * If clear_extent_bit failed for enomem reasons, we can't allow the + * release to continue. + */ + if (ret2 == 0) + ret = true; +out: + btrfs_free_extent_state(cached_state); - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret2 < 0) - ret = false; - else - ret = true; - } return ret; } @@ -2481,7 +2649,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2492,18 +2660,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) struct extent_map *em; write_lock(&extent_tree->lock); - em = lookup_extent_mapping(extent_tree, start, len); + em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) { write_unlock(&extent_tree->lock); break; } if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&extent_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); break; } - if (test_range_bit_exists(io_tree, em->start, - extent_map_end(em) - 1, EXTENT_LOCKED)) + if (btrfs_test_range_bit_exists(io_tree, em->start, + btrfs_extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast @@ -2530,15 +2699,15 @@ remove_em: * fsync performance for workloads with a data size that exceeds * or is close to the system's memory). */ - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); next: - start = extent_map_end(em); + start = btrfs_extent_map_end(em); write_unlock(&extent_tree->lock); /* Once for us, for the lookup_extent_mapping() reference. */ - free_extent_map(em); + btrfs_free_extent_map(em); if (need_resched()) { /* @@ -2577,6 +2746,7 @@ static bool folio_range_has_eb(struct folio *folio) static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = folio->mapping; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* @@ -2584,21 +2754,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * be done under the i_private_lock. */ if (mapped) - spin_lock(&folio->mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. */ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2608,7 +2777,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo folio_detach_private(folio); } if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } @@ -2618,7 +2787,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2629,9 +2798,9 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); } /* Release all folios attached to the extent buffer */ @@ -2646,9 +2815,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) continue; detach_extent_buffer_folio(eb, folio); - - /* One for when we allocated the folio. */ - folio_put(folio); } } @@ -2662,15 +2828,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2679,18 +2844,36 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } +/* + * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() + * does not call folio_put(), and we need to set the folios to NULL so that + * btrfs_release_extent_buffer() will not detach them a second time. + */ +static void cleanup_extent_buffer_folios(struct extent_buffer *eb) +{ + const int num_folios = num_extent_folios(eb); + + /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + for (int i = 0; i < num_folios; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} + struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); + int num_folios; int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2702,78 +2885,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); ret = alloc_eb_folio_array(new, false); - if (ret) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret) + goto release_eb; + ASSERT(num_extent_folios(src) == num_extent_folios(new), + "%d != %d", num_extent_folios(src), num_extent_folios(new)); + /* Explicitly use the cached num_extent value from now on. */ + num_folios = num_extent_folios(src); for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); - if (ret < 0) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret < 0) + goto cleanup_folios; WARN_ON(folio_test_dirty(folio)); } + for (int i = 0; i < num_folios; i++) + folio_put(new->folios[i]); + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; + +cleanup_folios: + cleanup_extent_buffer_folios(new); +release_eb: + btrfs_release_extent_buffer(new); + return NULL; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto release_eb; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto cleanup_folios; } + for (int i = 0; i < num_extent_folios(eb); i++) + folio_put(eb->folios[i]); set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { - if (eb->folios[i]) { - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_put(eb->folios[i]); - } - } - kmem_cache_free(extent_buffer_cache, eb); - return NULL; -} -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); +cleanup_folios: + cleanup_extent_buffer_folios(eb); +release_eb: + btrfs_release_extent_buffer(eb); + return NULL; } static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2785,13 +2968,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not. To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -2805,11 +2987,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_mark_accessed(eb->folios[i]); } @@ -2842,10 +3022,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2857,32 +3037,34 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!atomic_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return exists; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); return eb; -free_eb: - btrfs_release_extent_buffer(eb); - return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2892,11 +3074,11 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. */ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2928,10 +3110,9 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return true; } - if (fs_info->nodesize < PAGE_SIZE && - offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %u", + "tree block is not nodesize aligned, start %llu nodesize %u", start, fs_info->nodesize); return true; } @@ -2967,7 +3148,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio = NULL; + struct folio *existing_folio; int ret; ASSERT(found_eb_ret); @@ -2976,6 +3157,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ASSERT(eb->folios[i]); retry: + existing_folio = NULL; ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) @@ -2983,10 +3165,8 @@ retry: existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) { - existing_folio = NULL; + if (IS_ERR(existing_folio)) goto retry; - } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -2999,7 +3179,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3027,7 +3207,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -3041,8 +3221,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3070,7 +3248,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3090,8 +3268,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3106,9 +3284,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3137,7 +3314,7 @@ reallocate: * using 0-order folios. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate; } attached++; @@ -3148,7 +3325,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3159,15 +3336,14 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. */ } if (uptodate) @@ -3176,34 +3352,42 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->sectorsize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!atomic_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) { folio_unlock(eb->folios[i]); + /* + * A folio that has been added to an address_space mapping + * should not continue holding the refcount from its original + * allocation indefinitely. + */ + folio_put(eb->folios[i]); + } return eb; out: @@ -3217,26 +3401,22 @@ out: * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * - * We have to drop our reference and NULL it out here because in the - * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. - * Below when we call btrfs_release_extent_buffer() we will call - * detach_extent_buffer_folio() on our remaining pages in the !subpage - * case. If we left eb->folios[i] populated in the subpage case we'd - * double put our reference and be super sad. + * Note: the bounds is num_extent_pages() as we need to go through all slots. */ - for (int i = 0; i < attached; i++) { - ASSERT(eb->folios[i]); - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_unlock(eb->folios[i]); - folio_put(eb->folios[i]); + for (int i = 0; i < num_extent_pages(eb); i++) { + struct folio *folio = eb->folios[i]; + + if (i < attached) { + ASSERT(folio); + detach_extent_buffer_folio(eb, folio); + folio_unlock(folio); + } else if (!folio) { + continue; + } + + folio_put(folio); eb->folios[i] = NULL; } - /* - * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. - */ - set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - btrfs_release_extent_buffer(eb); if (ret < 0) return ERR_PTR(ret); @@ -3259,18 +3439,27 @@ static int release_extent_buffer(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); - } else { - spin_unlock(&eb->refs_lock); - } + /* + * We're erasing, theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not, we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->sectorsize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */ @@ -3333,38 +3522,21 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) - __xa_clear_mark(&folio->mapping->i_pages, - folio_index(folio), PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&folio->mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. */ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3388,20 +3560,20 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; + buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -3409,37 +3581,35 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. */ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); + buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3447,54 +3617,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3507,10 +3654,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3525,25 +3669,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) uptodate = false; - if (uptodate) { + if (uptodate) set_extent_buffer_uptodate(eb); - } else { + else clear_extent_buffer_uptodate(eb); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - } - - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3555,7 +3684,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3583,7 +3711,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); atomic_inc(&eb->refs); @@ -3595,19 +3722,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3634,7 +3756,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return true; } @@ -3796,7 +3918,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4170,71 +4292,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 -static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. @@ -4242,10 +4310,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4263,7 +4330,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. @@ -4282,7 +4352,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* |