Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig               |   8
-rw-r--r--  block/Makefile              |   5
-rw-r--r--  block/bdev.c                |   3
-rw-r--r--  block/bfq-iosched.c         |   6
-rw-r--r--  block/bio-integrity-auto.c  |  62
-rw-r--r--  block/bio-integrity.c       |   4
-rw-r--r--  block/bio.c                 | 160
-rw-r--r--  block/blk-cgroup.c          |  10
-rw-r--r--  block/blk-core.c            |   2
-rw-r--r--  block/blk-crypto-fallback.c |   1
-rw-r--r--  block/blk-map.c             |  93
-rw-r--r--  block/blk-merge.c           | 137
-rw-r--r--  block/blk-mq-debugfs.c      |  13
-rw-r--r--  block/blk-mq-dma.c          | 116
-rw-r--r--  block/blk-mq-sched.c        |  53
-rw-r--r--  block/blk-mq.c              | 309
-rw-r--r--  block/blk-mq.h              |   7
-rw-r--r--  block/blk-rq-qos.c          |   4
-rw-r--r--  block/blk-rq-qos.h          |  21
-rw-r--r--  block/blk-settings.c        |   5
-rw-r--r--  block/blk-sysfs.c           |  34
-rw-r--r--  block/blk-throttle.c        | 411
-rw-r--r--  block/blk-throttle.h        |  36
-rw-r--r--  block/blk-wbt.c             |  11
-rw-r--r--  block/blk.h                 |  53
-rw-r--r--  block/bounce.c              | 267
-rw-r--r--  block/elevator.c            | 329
-rw-r--r--  block/elevator.h            |   6
-rw-r--r--  block/fops.c                |  28
-rw-r--r--  block/genhd.c               | 266
-rw-r--r--  block/ioprio.c              |   6
-rw-r--r--  block/mq-deadline.c         |   2
32 files changed, 1250 insertions, 1218 deletions
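
The blk-rq-qos portion of the patch below puts every per-bio/per-request QoS hook behind a global static key, block_rq_qos: rq_qos_add() bumps it with static_branch_inc() and rq_qos_exit() drops it with static_branch_dec(), so queues with no registered policy skip the hook checks almost for free. The snippet here is only a condensed sketch of that pattern, not the block-layer code itself; the my_* names are placeholders and only the <linux/jump_label.h> calls mirror the patch.

/*
 * Condensed illustration of the static-key fast path introduced by the
 * blk-rq-qos hunks below.  "my_qos_policy", "my_q" and the hook names
 * are made-up placeholders for this sketch.
 */
#include <linux/jump_label.h>

struct my_qos_policy;                   /* opaque stand-in for struct rq_qos */
void __my_qos_hook(struct my_qos_policy *rqos);

/* False by default: the branch stays patched out until a policy registers. */
static DEFINE_STATIC_KEY_FALSE(block_rq_qos);

struct my_q {
	struct my_qos_policy *rq_qos;   /* head of the per-queue policy list */
};

static inline void my_qos_hook(struct my_q *q)
{
	/* Near-zero cost when no QoS policy exists anywhere in the system. */
	if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
		__my_qos_hook(q->rq_qos);
}

static void my_qos_policy_added(void)   /* rq_qos_add() side */
{
	static_branch_inc(&block_rq_qos);
}

static void my_qos_policy_removed(void) /* rq_qos_exit() side */
{
	static_branch_dec(&block_rq_qos);
}

The inline helpers changed in block/blk-rq-qos.h (rq_qos_throttle(), rq_qos_done(), rq_qos_merge() and the rest) all gain the same static_branch_unlikely(&block_rq_qos) prefix in the hunks further down.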
diff --git a/block/Kconfig b/block/Kconfig index df8973bc0539..15027963472d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK source "block/partitions/Kconfig" -config BLK_MQ_PCI - def_bool PCI - -config BLK_MQ_VIRTIO - bool - depends on VIRTIO - default y - config BLK_PM def_bool PM diff --git a/block/Makefile b/block/Makefile index 3a941dc0d27f..c65f4da93702 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,13 +5,12 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-merge.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \ + blk-mq-tag.o blk-mq-dma.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ disk-events.o blk-ia-ranges.o early-lookup.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/bdev.c b/block/bdev.c index 889ec6e002d7..b77ddd12dc06 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1335,7 +1335,8 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), - queue_atomic_write_unit_max_bytes(bd_queue)); + queue_atomic_write_unit_max_bytes(bd_queue), + 0); } stat->blksize = bdev_io_min(bdev); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index abd80dc13562..0cb1e9873aab 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7210,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); - clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); - wbt_enable_default(bfqd->queue->disk); + blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue); + set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags); kfree(bfqd); } @@ -7397,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); - set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); + blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index e524c609be50..9c6657664792 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -9,6 +9,7 @@ * not aware of PI. 
*/ #include <linux/blk-integrity.h> +#include <linux/t10-pi.h> #include <linux/workqueue.h> #include "blk.h" @@ -43,6 +44,29 @@ static void bio_integrity_verify_fn(struct work_struct *work) bio_endio(bio); } +#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) +static bool bip_should_check(struct bio_integrity_payload *bip) +{ + return bip->bip_flags & BIP_CHECK_FLAGS; +} + +static bool bi_offload_capable(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return bi->tuple_size == sizeof(struct crc64_pi_tuple); + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return bi->tuple_size == sizeof(struct t10_pi_tuple); + default: + pr_warn_once("%s: unknown integrity checksum type:%d\n", + __func__, bi->csum_type); + fallthrough; + case BLK_INTEGRITY_CSUM_NONE: + return false; + } +} + /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio @@ -54,12 +78,12 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_data *bid = container_of(bip, struct bio_integrity_data, bip); - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && + bip_should_check(bip)) { INIT_WORK(&bid->work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bid->work); return false; @@ -84,6 +108,7 @@ bool bio_integrity_prep(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; + bool set_flags = true; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; @@ -100,19 +125,24 @@ bool bio_integrity_prep(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; + if (bi->flags & BLK_INTEGRITY_NOVERIFY) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + } break; case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk for non-integrity metadata where nothing else * initializes the memory. 
*/ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + if (bi->flags & BLK_INTEGRITY_NOGENERATE) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + gfp |= __GFP_ZERO; + } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; break; default: @@ -137,19 +167,21 @@ bool bio_integrity_prep(struct bio *bio) bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bid->bip.bip_flags |= BIP_IP_CHECKSUM; - if (bi->csum_type) - bid->bip.bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bid->bip.bip_flags |= BIP_CHECK_REFTAG; + if (set_flags) { + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bid->bip.bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bid->bip.bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bid->bip.bip_flags |= BIP_CHECK_REFTAG; + } if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) goto err_end_io; /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) + if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 43ef6bd06c85..cb94e9be26dc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -127,10 +127,8 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; - bool same_page = false; - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - &same_page)) { + if (bvec_try_merge_hw_page(q, bv, page, len, offset)) { bip->bip_iter.bi_size += len; return len; } diff --git a/block/bio.c b/block/bio.c index 4e6c85a33d74..3c0a558c90f5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; + bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -611,7 +612,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_vecs > UIO_MAXIOV) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } @@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { @@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, - unsigned int len, unsigned int off, bool *same_page) + unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return false; - *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & - PAGE_MASK)); - if (!*same_page) { + if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) @@ -953,8 +953,7 @@ static bool 
bvec_try_merge_page(struct bio_vec *bv, struct page *page, * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page) + struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); @@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; - return bvec_try_merge_page(bv, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset); } /** @@ -990,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(__bio_add_page); /** + * bio_add_virt_nofail - add data in the direct kernel mapping to a bio + * @bio: destination bio + * @vaddr: data to add + * @len: length of the data to add, may cross pages + * + * Add the data at @vaddr to @bio. The caller must have ensure a segment + * is available for the added data. No merging into an existing segment + * will be performed. + */ +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) +{ + __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); +} +EXPORT_SYMBOL_GPL(bio_add_virt_nofail); + +/** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add @@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page); int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - bool same_page = false; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) @@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page, if (bio->bi_vcnt > 0 && bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, &same_page)) { + page, len, offset)) { bio->bi_iter.bi_size += len; return len; } @@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, } EXPORT_SYMBOL(bio_add_folio); +/** + * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio and return how many bytes were added. + * This may be less than the amount originally asked. Returns 0 if no data + * could be added to @bio. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) +{ + unsigned int offset = offset_in_page(vaddr); + + len = min(len, PAGE_SIZE - offset); + if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) + return 0; + if (op_is_write(bio_op(bio))) + flush_kernel_vmap_range(vaddr, len); + return len; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); + +/** + * bio_add_vmalloc - add a vmalloc region to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio. Return %true on success or %false if + * @bio does not have enough space for the payload. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. 
+ */ +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) +{ + do { + unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); + + if (!added) + return false; + vaddr += added; + len -= added; + } while (len); + + return true; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc); + void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; @@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, - size_t offset) -{ - bool same_page = false; - - if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) - return -EIO; - - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - folio_page(folio, 0), len, offset, - &same_page)) { - bio->bi_iter.bi_size += len; - if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) - unpin_user_folio(folio, 1); - return 0; - } - bio_add_folio_nofail(bio, folio, len, offset); - return 0; -} - static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, @@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); + unsigned int old_vcnt = bio->bi_vcnt; folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; @@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); - bio_iov_add_folio(bio, folio, len, folio_offset); + if (!bio_add_folio(bio, folio, len, folio_offset)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + /* + * We're adding another fragment of a page that already + * was part of the last segment. Undo our pin as the + * page was pinned when an earlier fragment of it was + * added to the bio and __bio_release_pages expects a + * single pin per page. + */ + if (offset && bio->bi_vcnt == old_vcnt) + unpin_user_folio(folio, 1); + } offset = 0; } @@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +/** + * bdev_rw_virt - synchronously read into / write from kernel mapping + * @bdev: block device to access + * @sector: sector to access + * @data: data to read/write + * @len: length in byte to read/write + * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) + * + * Performs synchronous I/O to @bdev for @data/@len. @data must be in + * the kernel direct mapping and not a vmalloc address. + */ +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op) +{ + struct bio_vec bv; + struct bio bio; + int error; + + if (WARN_ON_ONCE(is_vmalloc_addr(data))) + return -EIO; + + bio_init(&bio, bdev, &bv, 1, op); + bio.bi_iter.bi_sector = sector; + bio_add_virt_nofail(&bio, data, len); + error = submit_bio_wait(&bio); + bio_uninit(&bio); + return error; +} +EXPORT_SYMBOL_GPL(bdev_rw_virt); + static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ce93706555c5..5936db7f8475 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1074,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) /* * For covering concurrent parent blkg update from blkg_release(). 
* - * When flushing from cgroup, cgroup_rstat_lock is always held, so - * this lock won't cause contention most of time. + * When flushing from cgroup, the subsystem rstat lock is always held, + * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); @@ -1144,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * @@ -1253,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else - cgroup_rstat_flush(blkcg->css.cgroup); + css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -2243,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/block/blk-core.c b/block/blk-core.c index e8cc270a453f..b862c66018f2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1018,7 +1018,7 @@ again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && - (end || part_in_flight(part))) + (end || bdev_count_inflight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index f154be0b575a..005c9157ffb3 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-map.c b/block/blk-map.c index d2f22744b3d1..23e5d5ebe59e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio) kfree(bio); } -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -static struct bio *bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; + unsigned int nr_vecs = bio_add_max_vecs(data, len); struct bio *bio; - bio = bio_kmalloc(nr_pages, gfp_mask); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + if (is_vmalloc_addr(data)) { bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_page(bio, page, bytes, offset) < bytes) { - /* we don't support partial mappings */ + if (!bio_add_vmalloc(bio, data, len)) { bio_uninit(bio); kfree(bio); return ERR_PTR(-EINVAL); } - - data += bytes; - len -= bytes; - offset = 0; + } else { + bio_add_virt_nofail(bio, data, len); } - bio->bi_end_io = bio_map_kern_endio; return bio; } @@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio) /** * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes + * @op: bio/request operation * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. 
*/ -static struct bio *bio_copy_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask, int reading) +static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); while (len) { struct page *page; @@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (!page) goto cleanup; - if (!reading) + if (op_is_write(op)) memcpy(page_address(page), p, bytes); if (bio_add_page(bio, page, bytes, 0) < bytes) @@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, p += bytes; } - if (reading) { + if (op_is_write(op)) { + bio->bi_end_io = bio_copy_kern_endio; + } else { bio->bi_end_io = bio_copy_kern_endio_read; bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; } return bio; @@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (map_data) copy = true; - else if (blk_queue_may_bounce(q)) - copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (iov_iter_is_bvec(iter)) @@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests - * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data @@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * buffer is used. Can be called multiple times to append multiple * buffers. */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp_mask) { - int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; - if (len > (queue_max_hw_sectors(q) << 9)) + if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT)) return -EINVAL; if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || - blk_queue_may_bounce(q)) - bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf)) + bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask); else - bio = bio_map_kern(q, kbuf, len, gfp_mask); + bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); - ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { bio_uninit(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index fdd4efb54c6c..3af1d284add5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,7 +7,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> -#include <linux/scatterlist.h> #include <linux/part_stat.h> #include <linux/blk-cgroup.h> @@ -226,27 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio, } /** - * get_max_segment_size() - maximum number of bytes to add as a single segment - * @lim: Request queue limits. 
- * @paddr: address of the range to add - * @len: maximum length available to add at @paddr - * - * Returns the maximum number of bytes of the range starting at @paddr that can - * be added to a single segment. - */ -static inline unsigned get_max_segment_size(const struct queue_limits *lim, - phys_addr_t paddr, unsigned int len) -{ - /* - * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 - * after having calculated the minimum. - */ - return min_t(unsigned long, len, - min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), - (unsigned long)lim->max_segment_size - 1) + 1); -} - -/** * bvec_split_segs - verify whether or not a bvec should be split in the middle * @lim: [in] queue limits to split based on * @bv: [in] bvec to examine @@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return nr_phys_segs; } -struct phys_vec { - phys_addr_t paddr; - u32 len; -}; - -static bool blk_map_iter_next(struct request *req, - struct req_iterator *iter, struct phys_vec *vec) -{ - unsigned int max_size; - struct bio_vec bv; - - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - - if (!iter->iter.bi_size) - return false; - - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); - vec->paddr = bvec_phys(&bv); - max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); - bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); - - /* - * If we are entirely done with this bi_io_vec entry, check if the next - * one could be merged into it. This typically happens when moving to - * the next bio, but some callers also don't pack bvecs tight. - */ - while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { - struct bio_vec next; - - if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - } - - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); - if (bv.bv_len + next.bv_len > max_size || - !biovec_phys_mergeable(req->q, &bv, &next)) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); - } - - vec->len = bv.bv_len; - return true; -} - -static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, - struct scatterlist *sglist) -{ - if (!*sg) - return sglist; - - /* - * If the driver previously mapped a shorter list, we could see a - * termination bit prematurely unless it fully inits the sg table - * on each mapping. We KNOW that there must be more entries here - * or the driver would be buggy, so force clear the termination bit - * to avoid doing a full sg_init_table() in drivers for each command. - */ - sg_unmark_end(*sg); - return sg_next(*sg); -} - -/* - * Map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries. 
- */ -int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, - struct scatterlist **last_sg) -{ - struct req_iterator iter = { - .bio = rq->bio, - }; - struct phys_vec vec; - int nsegs = 0; - - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - - while (blk_map_iter_next(rq, &iter, &vec)) { - *last_sg = blk_next_sg(last_sg, sglist); - sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, - offset_in_page(vec.paddr)); - nsegs++; - } - - if (*last_sg) - sg_mark_end(*last_sg); - - /* - * Something must have been wrong if the figured number of - * segment is bigger than number of req's physical segments - */ - WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); - - return nsegs; -} -EXPORT_SYMBOL(__blk_rq_map_sg); - static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { @@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q, if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; + if (req->bio->bi_write_stream != next->bio->bi_write_stream) + return NULL; if (req->bio->bi_ioprio != next->bio->bi_ioprio) return NULL; if (!blk_atomic_write_mergeable_rqs(req, next)) @@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; + if (rq->bio->bi_write_stream != bio->bi_write_stream) + return false; if (rq->bio->bi_ioprio != bio->bi_ioprio) return false; if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3421b5521fe2..29b3540dd180 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -93,6 +93,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(DISABLE_WBT_DEF), + QUEUE_FLAG_NAME(NO_ELV_SWITCH), }; #undef QUEUE_FLAG_NAME @@ -624,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q) debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); - /* - * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir - * didn't exist yet (because we don't know what to name the directory - * until the queue is registered to a gendisk). - */ - if (q->elevator && !q->sched_debugfs_dir) - blk_mq_debugfs_register_sched(q); - - /* Similarly, blk_mq_init_hctx() couldn't do this previously. 
*/ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); - if (q->elevator && !hctx->sched_debugfs_dir) - blk_mq_debugfs_register_sched_hctx(q, hctx); } if (q->rq_qos) { diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c new file mode 100644 index 000000000000..82bae475dfa4 --- /dev/null +++ b/block/blk-mq-dma.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Christoph Hellwig + */ +#include "blk.h" + +struct phys_vec { + phys_addr_t paddr; + u32 len; +}; + +static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, + struct phys_vec *vec) +{ + unsigned int max_size; + struct bio_vec bv; + + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; + } + + if (!iter->iter.bi_size) + return false; + + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. + */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } + + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + + vec->len = bv.bv_len; + return true; +} + +static inline struct scatterlist * +blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. 
+ */ +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) +{ + struct req_iterator iter = { + .bio = rq->bio, + }; + struct phys_vec vec; + int nsegs = 0; + + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } + + if (*last_sg) + sg_mark_end(*last_sg); + + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); + + return nsegs; +} +EXPORT_SYMBOL(__blk_rq_map_sg); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 109611445d40..55a0fd105147 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); - unsigned int count = 0; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } - count++; } list_splice_tail_init(rq_list, &hctx_list); dispatch: - return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); + return blk_mq_dispatch_rq_list(hctx, &hctx_list, false); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ @@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { - dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false); } if (busy) @@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) + if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true)) return 0; need_dispatch = true; } else { @@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + blk_mq_dispatch_rq_list(hctx, &rq_list, true); return 0; } @@ -436,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) return 0; } +void blk_mq_sched_reg_debugfs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_sched(q); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); +} + +void blk_mq_sched_unreg_debugfs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&q->debugfs_mutex); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_unregister_sched_hctx(hctx); + blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); +} + /* caller must have a reference to @e, will grab 
another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { @@ -469,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched(q); - mutex_unlock(&q->debugfs_mutex); - queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); @@ -484,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched_hctx(q, hctx); - mutex_unlock(&q->debugfs_mutex); } - return 0; err_free_map_and_rqs: @@ -527,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched_hctx(hctx); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; @@ -538,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) flags = hctx->flags; } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched(q); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); + set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags); q->elevator = NULL; } diff --git a/block/blk-mq.c b/block/blk-mq.c index c2697db59109..4806b867e37d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct request *rq, void *priv) +static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; @@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv) return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part) +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - - return mi.inflight[0] + mi.inflight[1]; -} - -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]) -{ - struct mq_inflight mi = { .part = part }; - - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - inflight[0] = mi.inflight[0]; - inflight[1] = mi.inflight[1]; + blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, + &mi); + inflight[READ] = mi.inflight[READ]; + inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP @@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; int ret; @@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = 
NULL, + .ctx = NULL, + .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; @@ -2080,7 +2084,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, - unsigned int nr_budgets) + bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; @@ -2102,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; @@ -2111,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bd.rq = rq; bd.last = list_empty(list); - /* - * once the request is queued to lld, no need to cover the - * budget any more - */ - if (nr_budgets) - nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: @@ -2150,7 +2148,11 @@ out: ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); - if (nr_budgets) + /* + * If the caller allocated budgets, free the budgets of the + * requests that have not yet been passed to the block driver. + */ + if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); @@ -2778,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_issue_directly(hctx, rq, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug) +static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(&plug->mq_list); + while ((rq = rq_list_pop(rqs))) { + bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2817,15 +2819,64 @@ out: blk_mq_commit_rqs(hctx, queued, false); } -static void __blk_mq_flush_plug_list(struct request_queue *q, - struct blk_plug *plug) +static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; - q->mq_ops->queue_rqs(&plug->mq_list); + q->mq_ops->queue_rqs(rqs); +} + +static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, + struct rq_list *queue_rqs) +{ + struct request *rq = rq_list_pop(rqs); + struct request_queue *this_q = rq->q; + struct request **prev = &rqs->head; + struct rq_list matched_rqs = {}; + struct request *last = NULL; + unsigned depth = 1; + + rq_list_add_tail(&matched_rqs, rq); + while ((rq = *prev)) { + if (rq->q == this_q) { + /* move rq from rqs to matched_rqs */ + *prev = rq->rq_next; + rq_list_add_tail(&matched_rqs, rq); + depth++; + } else { + /* leave rq in rqs */ + prev = &rq->rq_next; + last = rq; + } + } + + rqs->tail = last; + *queue_rqs = matched_rqs; + return depth; +} + +static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) +{ + struct request_queue *q = rq_list_peek(rqs)->q; + + trace_block_unplug(q, depth, true); + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole list in one go. + * We already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. 
+ */ + if (q->mq_ops->queue_rqs) { + blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); + if (rq_list_empty(rqs)) + return; + } + + blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } -static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; @@ -2835,7 +2886,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) LIST_HEAD(list); do { - struct request *rq = rq_list_pop(&plug->mq_list); + struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; @@ -2848,9 +2899,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) } list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(&plug->mq_list)); + } while (!rq_list_empty(rqs)); - plug->mq_list = requeue_list; + *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); @@ -2870,9 +2921,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_put(&this_hctx->queue->q_usage_counter); } +static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) +{ + do { + struct rq_list queue_rqs; + unsigned depth; + + depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); + blk_mq_dispatch_queue_requests(&queue_rqs, depth); + while (!rq_list_empty(&queue_rqs)) + blk_mq_dispatch_list(&queue_rqs, false); + } while (!rq_list_empty(rqs)); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request *rq; unsigned int depth; /* @@ -2887,34 +2950,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) depth = plug->rq_count; plug->rq_count = 0; - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q; - - rq = rq_list_peek(&plug->mq_list); - q = rq->q; - trace_block_unplug(q, depth, true); - - /* - * Peek first request and see if we have a ->queue_rqs() hook. - * If we do, we can dispatch the whole plug list in one go. We - * already know at this point that all requests belong to the - * same queue, caller must ensure that's the case. - */ - if (q->mq_ops->queue_rqs) { - blk_mq_run_dispatch_ops(q, - __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(&plug->mq_list)) - return; + if (!plug->has_elevator && !from_schedule) { + if (plug->multiple_queues) { + blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); + return; } - blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug)); + blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { - blk_mq_dispatch_plug_list(plug, from_schedule); + blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } @@ -2969,8 +3017,14 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, { struct blk_mq_alloc_data data = { .q = q, - .nr_tags = 1, + .flags = 0, + .shallow_depth = 0, .cmd_flags = bio->bi_opf, + .rq_flags = 0, + .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -3080,8 +3134,6 @@ void blk_mq_submit_bio(struct bio *bio) goto new_request; } - bio = blk_queue_bounce(bio, q); - /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. 
@@ -4094,8 +4146,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; - mutex_lock(&q->elevator_lock); - queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4200,8 +4250,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - mutex_unlock(&q->elevator_lock); } /* @@ -4505,16 +4553,9 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, - struct request_queue *q, bool lock) + struct request_queue *q) { - if (lock) { - /* protect against switching io scheduler */ - mutex_lock(&q->elevator_lock); - __blk_mq_realloc_hw_ctxs(set, q); - mutex_unlock(&q->elevator_lock); - } else { - __blk_mq_realloc_hw_ctxs(set, q); - } + __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4546,7 +4587,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q, false); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4563,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); - blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); + blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: @@ -4784,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_free_srcu; } + init_rwsem(&set->update_nr_hwq_lock); + ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, @@ -4923,88 +4966,10 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } -/* - * request_queue and elevator_type pair. - * It is just used by __blk_mq_update_nr_hw_queues to cache - * the elevator_type associated with a request_queue. - */ -struct blk_mq_qe_pair { - struct list_head node; - struct request_queue *q; - struct elevator_type *type; -}; - -/* - * Cache the elevator_type in qe pair list and switch the - * io scheduler to 'none' - */ -static bool blk_mq_elv_switch_none(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); - if (!qe) - return false; - - /* Accessing q->elevator needs protection from ->elevator_lock. 
*/ - mutex_lock(&q->elevator_lock); - - if (!q->elevator) { - kfree(qe); - goto unlock; - } - - INIT_LIST_HEAD(&qe->node); - qe->q = q; - qe->type = q->elevator->type; - /* keep a reference to the elevator module as we'll switch back */ - __elevator_get(qe->type); - list_add(&qe->node, head); - elevator_disable(q); -unlock: - mutex_unlock(&q->elevator_lock); - - return true; -} - -static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - - list_for_each_entry(qe, head, node) - if (qe->q == q) - return qe; - - return NULL; -} - -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - struct elevator_type *t; - - qe = blk_lookup_qe_pair(head, q); - if (!qe) - return; - t = qe->type; - list_del(&qe->node); - kfree(qe); - - mutex_lock(&q->elevator_lock); - elevator_switch(q, t); - /* drop the reference acquired in blk_mq_elv_switch_none */ - elevator_put(t); - mutex_unlock(&q->elevator_lock); -} - static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; - LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; @@ -5019,30 +4984,24 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, return; memflags = memalloc_noio_save(); - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_freeze_queue_nomemsave(q); - - /* - * Switch IO scheduler to 'none', cleaning up the data associated - * with the previous scheduler. We will switch back once we are done - * updating the new sw to hw queue mappings. - */ - list_for_each_entry(q, &set->tag_list, tag_set_list) - if (!blk_mq_elv_switch_none(&head, q)) - goto switch_back; - list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue_nomemsave(q); + + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) { + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue_nomemrestore(q); goto reregister; + } fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_realloc_hw_ctxs(set, q, true); + __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -5058,18 +5017,18 @@ fallback: blk_mq_map_swqueue(q); } + /* elv_update_nr_hw_queues() unfreeze queue for us */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + elv_update_nr_hw_queues(q); + reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); - } - -switch_back: - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(&head, q); - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue_nomemrestore(q); + blk_mq_remove_hw_queues_cpuhp(q); + blk_mq_add_hw_queues_cpuhp(q); + } memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. 
*/ @@ -5079,9 +5038,11 @@ switch_back: void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { + down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); + up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 3011a78cf16a..affb2e14b56e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, - unsigned int); + bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -246,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]); +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 95982bc46ba1..848591fb3c57 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -2,6 +2,8 @@ #include "blk-rq-qos.h" +__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); + /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. 
@@ -317,6 +319,7 @@ void rq_qos_exit(struct request_queue *q) struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); + static_branch_dec(&block_rq_qos); } mutex_unlock(&q->rq_qos_mutex); } @@ -343,6 +346,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; + static_branch_inc(&block_rq_qos); blk_mq_unfreeze_queue(q, memflags); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 37245c97ee61..39749f4066fb 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -12,6 +12,7 @@ #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; +extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, @@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos && !blk_rq_is_passthrough(rq)) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && + !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { - if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + if (static_branch_unlikely(&block_rq_qos) && + bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) @@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio) static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 4817e7ca03f8..a000daafbfb4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -124,11 +124,6 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } - if (lim->features & BLK_FEAT_BOUNCE_HIGH) { - pr_warn("no bounce buffer support for integrity metadata\n"); - return -EINVAL; - } - if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity 
support disabled.\n"); return -EINVAL; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1f9b45b0b9ee..b2b9b89d6967 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -134,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) +QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) @@ -488,6 +490,8 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams"); +QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); @@ -560,7 +564,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->elevator_lock); + mutex_lock(&disk->rqos_state_mutex); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -573,7 +577,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->elevator_lock); + mutex_unlock(&disk->rqos_state_mutex); return ret; } @@ -593,7 +597,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, return -EINVAL; memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -618,11 +621,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, */ blk_mq_quiesce_queue(q); + mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); out: - mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; @@ -642,6 +646,8 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, + &queue_max_write_streams_entry.attr, + &queue_write_stream_granularity_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -869,16 +875,9 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_unregister_ia_ranges; - mutex_lock(&q->elevator_lock); - if (q->elevator) { - ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->elevator_lock); - goto out_crypto_sysfs_unregister; - } - } + if (queue_is_mq(q)) + elevator_set_default(q); wbt_enable_default(disk); - mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); @@ -902,8 +901,6 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_crypto_sysfs_unregister: - blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -951,10 +948,6 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->elevator_lock); - elv_unregister_queue(q); - mutex_unlock(&q->elevator_lock); - mutex_lock(&q->sysfs_lock); 
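Since max_write_streams and write_stream_granularity are exported above as plain read-only queue attributes, they can be read from userspace like any other file under /sys/block/<disk>/queue. A small sketch, assuming a hypothetical device name nvme0n1; a device without write-stream support simply reports max_write_streams as 0:

#include <stdio.h>

static long read_queue_limit(const char *dev, const char *attr)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", dev, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	const char *dev = "nvme0n1";	/* hypothetical device name */

	printf("max_write_streams:        %ld\n",
	       read_queue_limit(dev, "max_write_streams"));
	printf("write_stream_granularity: %ld\n",
	       read_queue_limit(dev, "write_stream_granularity"));
	return 0;
}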
disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); @@ -963,5 +956,8 @@ void blk_unregister_queue(struct gendisk *disk) kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); + if (queue_is_mq(q)) + elevator_set_none(q); + blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d6dd2e047874..bd15357f23bd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio) static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); - bio_list_init(&qn->bios); + bio_list_init(&qn->bios_bps); + bio_list_init(&qn->bios_iops); qn->tg = tg; } @@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to - * @queued: the service_queue->queued[] list @qn belongs to + * @sq: the service_queue @qn belongs to * - * Add @bio to @qn and put @qn on @queued if it's not already on. + * Add @bio to @qn and put @qn on @sq->queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, - struct list_head *queued) + struct throtl_service_queue *sq) { - bio_list_add(&qn->bios, bio); + bool rw = bio_data_dir(bio); + + /* + * Split bios have already been throttled by bps, so they are + * directly queued into the iops path. + */ + if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) || + bio_flagged(bio, BIO_BPS_THROTTLED)) { + bio_list_add(&qn->bios_iops, bio); + sq->nr_queued_iops[rw]++; + } else { + bio_list_add(&qn->bios_bps, bio); + sq->nr_queued_bps[rw]++; + } + if (list_empty(&qn->node)) { - list_add_tail(&qn->node, queued); + list_add_tail(&qn->node, &sq->queued[rw]); blkg_get(tg_to_blkg(qn->tg)); } } @@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek + * + * Always take a bio from the head of the iops queue first. If the queue is + * empty, we then take it from the bps queue to maintain the overall idea of + * fetching bios from the head. */ static struct bio *throtl_peek_queued(struct list_head *queued) { @@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_peek(&qn->bios); + bio = bio_list_peek(&qn->bios_iops); + if (!bio) + bio = bio_list_peek(&qn->bios_bps); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list - * @queued: the qnode list to pop a bio from + * @sq: the service_queue to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put + * @rw: read/write * - * Pop the first bio from the qnode list @queued. After popping, the first - * qnode is removed from @queued if empty or moved to the end of @queued so - * that the popping order is round-robin. + * Pop the first bio from the qnode list @sq->queued. Note that we firstly + * focus on the iops list because bios are ultimately dispatched from it. + * After popping, the first qnode is removed from @sq->queued if empty or moved + * to the end of @sq->queued so that the popping order is round-robin. 
* * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ -static struct bio *throtl_pop_queued(struct list_head *queued, - struct throtl_grp **tg_to_put) +static struct bio *throtl_pop_queued(struct throtl_service_queue *sq, + struct throtl_grp **tg_to_put, bool rw) { + struct list_head *queued = &sq->queued[rw]; struct throtl_qnode *qn; struct bio *bio; @@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued, return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_pop(&qn->bios); + bio = bio_list_pop(&qn->bios_iops); + if (bio) { + sq->nr_queued_iops[rw]--; + } else { + bio = bio_list_pop(&qn->bios_bps); + if (bio) + sq->nr_queued_bps[rw]--; + } WARN_ON_ONCE(!bio); - if (bio_list_empty(&qn->bios)) { + if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; @@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { + if (!time_before(tg->slice_end[rw], jiffy_end)) + return; + throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", @@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } +static unsigned int sq_queued(struct throtl_service_queue *sq, int type) +{ + return sq->nr_queued_bps[type] + sq->nr_queued_iops[type]; +} + static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { @@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } +static long long throtl_trim_bps(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u64 bps_limit = tg_bps_limit(tg, rw); + long long bytes_trim; + + if (bps_limit == U64_MAX) + return 0; + + /* Need to consider the case of bytes_allowed overflow. */ + bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed); + if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) { + bytes_trim = tg->bytes_disp[rw]; + tg->bytes_disp[rw] = 0; + } else { + tg->bytes_disp[rw] -= bytes_trim; + } + + return bytes_trim; +} + +static int throtl_trim_iops(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u32 iops_limit = tg_iops_limit(tg, rw); + int io_trim; + + if (iops_limit == UINT_MAX) + return 0; + + /* Need to consider the case of io_allowed overflow. */ + io_trim = calculate_io_allowed(iops_limit, time_elapsed); + if (io_trim <= 0 || tg->io_disp[rw] < io_trim) { + io_trim = tg->io_disp[rw]; + tg->io_disp[rw] = 0; + } else { + tg->io_disp[rw] -= io_trim; + } + + return io_trim; +} + /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { @@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * one extra slice is preserved for deviation. 
*/ time_elapsed -= tg->td->throtl_slice; - bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed); - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); - if (bytes_trim <= 0 && io_trim <= 0) + bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); + io_trim = throtl_trim_iops(tg, rw, time_elapsed); + if (!bytes_trim && !io_trim) return; - if ((long long)tg->bytes_disp[rw] >= bytes_trim) - tg->bytes_disp[rw] -= bytes_trim; - else - tg->bytes_disp[rw] = 0; - - if ((int)tg->io_disp[rw] >= io_trim) - tg->io_disp[rw] -= io_trim; - else - tg->io_disp[rw] = 0; - tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, @@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw, unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); + long long bytes_allowed; + int io_allowed; + + /* + * If the queue is empty, carryover handling is not needed. In such cases, + * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch + * of subsequent bios. The same handling applies when the previous BPS/IOPS + * limit was set to max. + */ + if (sq_queued(&tg->service_queue, rw) == 0) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + return; + } /* * If config is updated while bios are still throttled, calculate and - * accumulate how many bytes/ios are waited across changes. And - * carryover_bytes/ios will be used to calculate new wait time under new - * configuration. + * accumulate how many bytes/ios are waited across changes. And use the + * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which + * will be used to calculate new wait time under new configuration. + * And we need to consider the case of bytes/io_allowed overflow. */ - if (bps_limit != U64_MAX) - *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - - tg->bytes_disp[rw]; - if (iops_limit != UINT_MAX) - *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - - tg->io_disp[rw]; - tg->bytes_disp[rw] -= *bytes; - tg->io_disp[rw] -= *ios; + if (bps_limit != U64_MAX) { + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); + if (bytes_allowed > 0) + *bytes = bytes_allowed - tg->bytes_disp[rw]; + } + if (iops_limit != UINT_MAX) { + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); + if (io_allowed > 0) + *ios = io_allowed - tg->io_disp[rw]; + } + + tg->bytes_disp[rw] = -*bytes; + tg->io_disp[rw] = -*ios; } static void tg_update_carryover(struct throtl_grp *tg) @@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg) long long bytes[2] = {0}; int ios[2] = {0}; - if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); - if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); - /* see comments in struct throtl_grp for meaning of these fields. */ + /* see comments in struct throtl_grp for meaning of carryover. 
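 * (Illustration with made-up numbers, not from the patch: suppose the old
 *  limit had granted 1000000 bytes by the time the configuration changes,
 *  but only 300000 were actually dispatched. bytes_disp then becomes
 *  -(1000000 - 300000) = -700000, so the usual
 *  "bytes_disp + bio_size <= bytes_allowed" check still succeeds for e.g.
 *  a 500000-byte bio even though the new limit has only granted 100000
 *  bytes so far: -700000 + 500000 <= 100000.)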
*/ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } @@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - if (iops_limit == UINT_MAX) { - return 0; - } - jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ @@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { - return 0; - } - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ @@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); - if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + /* Need to consider the case of bytes_allowed overflow. */ + if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + || bytes_allowed < 0) return 0; /* Calc approx time to dispatch */ @@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, return jiffy_wait; } +static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) +{ + unsigned int bio_size = throtl_bio_data_size(bio); + + /* Charge the bio to the group */ + if (!bio_flagged(bio, BIO_BPS_THROTTLED) && + !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { + bio_set_flag(bio, BIO_TG_BPS_THROTTLED); + tg->bytes_disp[bio_data_dir(bio)] += bio_size; + } +} + +static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) +{ + bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); + tg->io_disp[bio_data_dir(bio)]++; +} + /* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched + * If previous slice expired, start a new one otherwise renew/extend existing + * slice to make sure it is at least throtl_slice interval long since now. New + * slice is started only for empty throttle group. If there is queued bio, that + * means there should be an active slice and it should be extended instead. 
*/ -static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) +static void tg_update_slice(struct throtl_grp *tg, bool rw) +{ + if (throtl_slice_used(tg, rw) && + sq_queued(&tg->service_queue, rw) == 0) + throtl_start_new_slice(tg, rw, true); + else + throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); +} + +static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); + unsigned long bps_wait; + + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || + bio_flagged(bio, BIO_BPS_THROTTLED) || + bio_flagged(bio, BIO_TG_BPS_THROTTLED)) + return 0; + + tg_update_slice(tg, rw); + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + throtl_extend_slice(tg, rw, jiffies + bps_wait); + + return bps_wait; +} + +static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); u32 iops_limit = tg_iops_limit(tg, rw); + unsigned long iops_wait; + + if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING) + return 0; + + tg_update_slice(tg, rw); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + throtl_extend_slice(tg, rw, jiffies + iops_wait); + + return iops_wait; +} + +/* + * Returns approx number of jiffies to wait before this bio is with-in IO rate + * and can be moved to other queue or dispatched. + */ +static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + unsigned long wait; /* * Currently whole state machine of group depends on first bio @@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * this function with a different bio if there are other bios * queued. */ - BUG_ON(tg->service_queue.nr_queued[rw] && + BUG_ON(sq_queued(&tg->service_queue, rw) && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); - /* If tg->bps = -1, then BW is unlimited */ - if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || - tg->flags & THROTL_TG_CANCELING) { - if (wait) - *wait = 0; - return true; - } + wait = tg_dispatch_bps_time(tg, bio); + if (wait != 0) + return wait; /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. New slice is started only for empty throttle group. - * If there is queued bio, that means there should be an active - * slice and it should be extended instead. + * Charge bps here because @bio will be directly placed into the + * iops queue afterward. 
*/ - if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw, true); - else { - if (time_before(tg->slice_end[rw], - jiffies + tg->td->throtl_slice)) - throtl_extend_slice(tg, rw, - jiffies + tg->td->throtl_slice); - } - - bps_wait = tg_within_bps_limit(tg, bio, bps_limit); - iops_wait = tg_within_iops_limit(tg, bio, iops_limit); - if (bps_wait + iops_wait == 0) { - if (wait) - *wait = 0; - return true; - } - - max_wait = max(bps_wait, iops_wait); - - if (wait) - *wait = max_wait; - - if (time_before(tg->slice_end[rw], jiffies + max_wait)) - throtl_extend_slice(tg, rw, jiffies + max_wait); - - return false; -} - -static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) -{ - bool rw = bio_data_dir(bio); - unsigned int bio_size = throtl_bio_data_size(bio); + throtl_charge_bps_bio(tg, bio); - /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->bytes_disp[rw] += bio_size; - - tg->io_disp[rw]++; + return tg_dispatch_iops_time(tg, bio); } /** @@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ - if (!sq->nr_queued[rw]) + if (sq_queued(sq, rw) == 0) tg->flags |= THROTL_TG_WAS_EMPTY; - throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + throtl_qnode_add_bio(bio, qn, sq); + + /* + * Since we have split the queues, when the iops queue is + * previously empty and a new @bio is added into the first @qn, + * we also need to update the @tg->disptime. + */ + if (bio_flagged(bio, BIO_BPS_THROTTLED) && + bio == throtl_peek_queued(&sq->queued[rw])) + tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; - sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; - unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + unsigned long read_wait = -1, write_wait = -1, min_wait, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) - tg_may_dispatch(tg, bio, &read_wait); + read_wait = tg_dispatch_time(tg, bio); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) - tg_may_dispatch(tg, bio, &write_wait); + write_wait = tg_dispatch_time(tg, bio); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; @@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg) /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; + tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, @@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. 
*/ - bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); - sq->nr_queued[rw]--; + bio = throtl_pop_queued(sq, &tg_to_put, rw); - throtl_charge_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to @@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], - &parent_sq->queued[rw]); + parent_sq); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } @@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, READ); nr_reads++; @@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; @@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) + if (sq_queued(sq, READ) || sq_queued(sq, WRITE)) tg_update_disptime(tg); else throtl_dequeue_tg(tg); @@ -1037,9 +1149,11 @@ again: dispatched = false; while (true) { + unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ); + unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE); + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", - sq->nr_queued[READ] + sq->nr_queued[WRITE], - sq->nr_queued[READ], sq->nr_queued[WRITE]); + bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w); ret = throtl_select_dispatch(sq); if (ret) { @@ -1061,7 +1175,8 @@ again: if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ @@ -1101,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) - while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + while ((bio = throtl_pop_queued(td_sq, NULL, rw))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); @@ -1606,11 +1721,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk) static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { - /* throtl is FIFO - if bios are already queued, should queue */ - if (tg->service_queue.nr_queued[rw]) + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * For a split bio, we need to specifically distinguish whether the + * iops queue is empty. + */ + if (bio_flagged(bio, BIO_BPS_THROTTLED)) + return sq->nr_queued_iops[rw] == 0 && + tg_dispatch_iops_time(tg, bio) == 0; + + /* + * Throtl is FIFO - if bios are already queued, should queue. + * If the bps queue is empty and @bio is within the bps limit, charge + * bps here for direct placement into the iops queue. 
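 * (Note, not part of the patch: together with the split bios_bps/bios_iops
 *  lists this makes throttling a two-stage budget. For example, a 4 KiB
 *  write with both limits set must first fit the bps budget, at which
 *  point throtl_charge_bps_bio() marks it BIO_TG_BPS_THROTTLED, and only
 *  then clear the iops budget before it may dispatch; a split bio that
 *  already carries BIO_BPS_THROTTLED skips the first stage and waits, if
 *  at all, only on the iops side.)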
+ */ + if (sq_queued(&tg->service_queue, rw)) { + if (sq->nr_queued_bps[rw] == 0 && + tg_dispatch_bps_time(tg, bio) == 0) + throtl_charge_bps_bio(tg, bio); + return false; + } - return tg_may_dispatch(tg, bio, NULL); + return tg_dispatch_time(tg, bio) == 0; } bool __blk_throtl_bio(struct bio *bio) @@ -1631,7 +1765,7 @@ bool __blk_throtl_bio(struct bio *bio) while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); /* * We need to trim slice even when bios are not being @@ -1654,7 +1788,8 @@ bool __blk_throtl_bio(struct bio *bio) * control algorithm is adaptive, and extra IO bytes * will be throttled for paying the debt */ - throtl_charge_bio(tg, bio); + throtl_charge_bps_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); } else { /* if above limits, break to queue */ break; @@ -1680,7 +1815,7 @@ bool __blk_throtl_bio(struct bio *bio) tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), - sq->nr_queued[READ], sq->nr_queued[WRITE]); + sq_queued(sq, READ), sq_queued(sq, WRITE)); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); @@ -1688,11 +1823,13 @@ bool __blk_throtl_bio(struct bio *bio) /* * Update @tg's dispatch time and force schedule dispatch if @tg - * was empty before @bio. The forced scheduling isn't likely to - * cause undue delay as @bio is likely to be dispatched directly if - * its @tg's disptime is not in the future. + * was empty before @bio, or the iops queue is empty and @bio will + * add to. The forced scheduling isn't likely to cause undue + * delay as @bio is likely to be dispatched directly if its @tg's + * disptime is not in the future. */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index f9f8666891ab..3b27755bfbff 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -29,7 +29,8 @@ */ struct throtl_qnode { struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ + struct bio_list bios_bps; /* queued bios for bps limit */ + struct bio_list bios_iops; /* queued bios for iops limit */ struct throtl_grp *tg; /* tg this qnode belongs to */ }; @@ -41,7 +42,8 @@ struct throtl_service_queue { * children throtl_grp's. */ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ + unsigned int nr_queued_bps[2]; /* number of queued bps bios */ + unsigned int nr_queued_iops[2]; /* number of queued iops bios */ /* * RB tree of active children throtl_grp's, which are sorted by @@ -54,9 +56,14 @@ struct throtl_service_queue { }; enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + /* + * The sq's iops queue is empty, and a bio is about to be enqueued + * to the first qnode's bios_iops list. 
+ */ + THROTL_TG_IOPS_WAS_EMPTY = 1 << 2, + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; struct throtl_grp { @@ -102,19 +109,16 @@ struct throtl_grp { /* IOPS limits */ unsigned int iops[2]; - /* Number of bytes dispatched in current slice */ - int64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - int io_disp[2]; - /* - * The following two fields are updated when new configuration is - * submitted while some bios are still throttled, they record how many - * bytes/ios are waited already in previous configuration, and they will - * be used to calculate wait time under new configuration. + * Number of bytes/bio's dispatched in current slice. + * When new configuration is submitted while some bios are still throttled, + * first calculate the carryover: the amount of bytes/IOs already waited + * under the previous configuration. Then, [bytes/io]_disp are represented + * as the negative of the carryover, and they will be used to calculate the + * wait time under the new configuration. */ - long long carryover_bytes[2]; - int carryover_ios[2]; + int64_t bytes_disp[2]; + int io_disp[2]; unsigned long last_check_time; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f1754d07f7e0..a50d4cd55f41 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -37,7 +37,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_SWAP = 4, /* write, from swap_writepage() */ + WBT_SWAP = 4, /* write, from swap_writeout() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ @@ -704,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk) struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); - if (q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + mutex_lock(&disk->rqos_state_mutex); + + if (blk_queue_disable_wbt(q)) enable = false; /* Throttling already enabled? */ @@ -713,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk) if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + mutex_unlock(&disk->rqos_state_mutex); return; } + mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... 
*/ if (!blk_queue_registered(q)) @@ -774,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk) struct rq_wb *rwb; if (!rqos) return; + mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } + mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/blk.h b/block/blk.h index 328075787814..37ec459fe656 100644 --- a/block/blk.h +++ b/block/blk.h @@ -103,8 +103,7 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page); + struct page *page, unsigned len, unsigned offset); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) @@ -322,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -int elevator_switch(struct request_queue *q, struct elevator_type *new_e); -void elevator_disable(struct request_queue *q); -void elevator_exit(struct request_queue *q); -int elv_register_queue(struct request_queue *q, bool uevent); -void elv_unregister_queue(struct request_queue *q); +void elv_update_nr_hw_queues(struct request_queue *q); +void elevator_set_default(struct request_queue *q); +void elevator_set_none(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -407,6 +404,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, } } +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @paddr: address of the range to add + * @len: maximum length available to add at @paddr + * + * Returns the maximum number of bytes of the range starting at @paddr that can + * be added to a single segment. + */ +static inline unsigned get_max_segment_size(const struct queue_limits *lim, + phys_addr_t paddr, unsigned int len) +{ + /* + * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. 
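 * (Worked example, not part of the patch: with seg_boundary_mask = 0xffff
 *  (a 64 KiB boundary), max_segment_size = 0x10000 and paddr = 0x1000f000,
 *  the room left to the boundary is 0xffff - (0xffff & paddr) = 0x0fff and
 *  the size clamp is 0x10000 - 1 = 0xffff; min() picks 0x0fff and adding 1
 *  afterwards yields 0x1000 bytes, finally capped at @len. Doing the "+ 1"
 *  only after the min() is what avoids wrapping to 0 when
 *  seg_boundary_mask is ULONG_MAX and @paddr sits at offset 0.)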
+ */ + return min_t(unsigned long, len, + min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), + (unsigned long)lim->max_segment_size - 1) + 1); +} + int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -421,7 +439,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); -unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { @@ -443,23 +460,6 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); - -static inline bool blk_queue_may_bounce(struct request_queue *q) -{ - return IS_ENABLED(CONFIG_BOUNCE) && - (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && - max_low_pfn >= max_pfn; -} - -static inline struct bio *blk_queue_bounce(struct bio *bio, - struct request_queue *q) -{ - if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) - return __blk_queue_bounce(bio, q); - return bio; -} - #ifdef CONFIG_BLK_DEV_ZONED void disk_init_zone_resources(struct gendisk *disk); void disk_free_zone_resources(struct gendisk *disk); @@ -480,7 +480,8 @@ static inline void blk_zone_update_request_bio(struct request *rq, * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) + if (req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) bio->bi_iter.bi_sector = rq->__sector; } void blk_zone_write_plug_bio_endio(struct bio *bio); diff --git a/block/bounce.c b/block/bounce.c deleted file mode 100644 index 09a9616cf209..000000000000 --- a/block/bounce.c +++ /dev/null @@ -1,267 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/gfp.h> -#include <linux/bio-integrity.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <linux/printk.h> -#include <asm/tlbflush.h> - -#include <trace/events/block.h> -#include "blk.h" -#include "blk-cgroup.h" - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool; - -static void init_bounce_bioset(void) -{ - static bool bounce_bs_setup; - int ret; - - if (bounce_bs_setup) - return; - - ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - BUG_ON(ret); - - ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); - BUG_ON(ret); - bounce_bs_setup = true; -} - -static __init int init_emergency_pool(void) -{ - int ret; - -#ifndef CONFIG_MEMORY_HOTPLUG - if (max_pfn <= max_low_pfn) - return 0; -#endif - - ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); - BUG_ON(ret); - pr_info("pool size: %d pages\n", POOL_SIZE); - - init_bounce_bioset(); - return 0; -} - -__initcall(init_emergency_pool); - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. 
kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - struct bio_vec tovec, fromvec; - struct bvec_iter iter; - /* - * The bio of @from is created by bounce, so we can iterate - * its bvec from start to end, but the @from->bi_iter can't be - * trusted because it might be changed by splitting. - */ - struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; - - bio_for_each_segment(tovec, to, iter) { - fromvec = bio_iter_iovec(from, from_iter); - if (tovec.bv_page != fromvec.bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + - tovec.bv_offset); - } - bio_advance_iter(from, &from_iter, tovec.bv_len); - } -} - -static void bounce_end_io(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, orig_vec; - struct bvec_iter orig_iter = bio_orig->bi_iter; - struct bvec_iter_all iter_all; - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, iter_all) { - orig_vec = bio_iter_iovec(bio_orig, orig_iter); - if (bvec->bv_page != orig_vec.bv_page) { - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, &page_pool); - } - bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); - } - - bio_orig->bi_status = bio->bi_status; - bio_endio(bio_orig); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio) -{ - bounce_end_io(bio); -} - -static void bounce_end_io_read(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - - if (!bio->bi_status) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio); -} - -static struct bio *bounce_clone_bio(struct bio *bio_src) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_VECS biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_VECS. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work. 
- */ - bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), - bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); - bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - bio_clone_blkg_association(bio, bio_src); - - return bio; - -err_put: - bio_put(bio); - return NULL; -} - -struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q) -{ - struct bio *bio; - int rw = bio_data_dir(bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i = 0, bytes = 0; - bool bounce = false; - int sectors; - - bio_for_each_segment(from, bio_orig, iter) { - if (i++ < BIO_MAX_VECS) - bytes += from.bv_len; - if (PageHighMem(from.bv_page)) - bounce = true; - } - if (!bounce) - return bio_orig; - - /* - * Individual bvecs might not be logical block aligned. Round down - * the split size so that each bio is properly block size aligned, - * even if we do not use the full hardware limits. - */ - sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> - SECTOR_SHIFT; - if (sectors < bio_sectors(bio_orig)) { - bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split); - bio_chain(bio, bio_orig); - submit_bio_noacct(bio_orig); - bio_orig = bio; - } - bio = bounce_clone_bio(bio_orig); - - /* - * Bvec table can't be updated by bio_for_each_segment_all(), - * so retrieve bvec from the table directly. This way is safe - * because the 'bio' is single-page bvec. 
- */ - for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *bounce_page; - - if (!PageHighMem(to->bv_page)) - continue; - - bounce_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(bounce_page, NR_BOUNCE); - - if (rw == WRITE) { - flush_dcache_page(to->bv_page); - memcpy_from_bvec(page_address(bounce_page), to); - } - to->bv_page = bounce_page; - } - - trace_block_bio_bounce(bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - else - bio->bi_end_io = bounce_end_io_write; - - bio->bi_private = bio_orig; - return bio; -} diff --git a/block/elevator.c b/block/elevator.c index b4d08026b02c..ab22542e6cf0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -45,6 +45,17 @@ #include "blk-wbt.h" #include "blk-cgroup.h" +/* Holding context data for changing elevator */ +struct elv_change_ctx { + const char *name; + bool no_uevent; + + /* for unregistering old elevator */ + struct elevator_queue *old; + /* for registering new elevator */ + struct elevator_queue *new; +}; + static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -148,18 +159,18 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q) +static void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->elevator_lock); + ioc_clear_queue(q); blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); - - kobject_put(&e->kobj); } static inline void __elv_rqhash_del(struct request *rq) @@ -412,14 +423,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->show) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? entry->show(e, page) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->show(e, page); mutex_unlock(&e->sysfs_lock); return error; } @@ -430,14 +442,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->store) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? 
entry->store(e, page, length) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->store(e, page, length); mutex_unlock(&e->sysfs_lock); return error; } @@ -452,13 +465,12 @@ static const struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q, bool uevent) +static int elv_register_queue(struct request_queue *q, + struct elevator_queue *e, + bool uevent) { - struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->elevator_lock); - error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { const struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -472,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); + /* + * Sched is initialized, it is ready to export it via + * debugfs + */ + blk_mq_sched_reg_debugfs(q); set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } -void elv_unregister_queue(struct request_queue *q) +static void elv_unregister_queue(struct request_queue *q, + struct elevator_queue *e) { - struct elevator_queue *e = q->elevator; - - lockdep_assert_held(&q->elevator_lock); - if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + /* unexport via debugfs before exiting sched */ + blk_mq_sched_unreg_debugfs(q); } } @@ -548,159 +565,194 @@ void elv_unregister(struct elevator_type *e) EXPORT_SYMBOL_GPL(elv_unregister); /* - * For single queue devices, default to using mq-deadline. If we have multiple - * queues or mq-deadline is not available, default to "none". - */ -static struct elevator_type *elevator_get_default(struct request_queue *q) -{ - if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) - return NULL; - - if (q->nr_hw_queues != 1 && - !blk_mq_is_shared_tags(q->tag_set->flags)) - return NULL; - - return elevator_find_get("mq-deadline"); -} - -/* - * Use the default elevator settings. If the chosen elevator initialization - * fails, fall back to the "none" elevator (no elevator). - */ -void elevator_init_mq(struct request_queue *q) -{ - struct elevator_type *e; - unsigned int memflags; - int err; - - WARN_ON_ONCE(blk_queue_registered(q)); - - if (unlikely(q->elevator)) - return; - - e = elevator_get_default(q); - if (!e) - return; - - /* - * We are called before adding disk, when there isn't any FS I/O, - * so freezing queue plus canceling dispatch work is enough to - * drain any dispatch activities originated from passthrough - * requests, then no need to quiesce queue which may add long boot - * latency, especially when lots of disks are involved. - * - * Disk isn't added yet, so verifying queue lock only manually. - */ - memflags = blk_mq_freeze_queue(q); - - blk_mq_cancel_work_sync(q); - - err = blk_mq_init_sched(q, e); - - blk_mq_unfreeze_queue(q, memflags); - - if (err) { - pr_warn("\"%s\" elevator initialization failed, " - "falling back to \"none\"\n", e->elevator_name); - } - - elevator_put(e); -} - -/* * Switch to new_e io scheduler. * * If switching fails, we are most likely running out of memory and not able * to restore the old io scheduler, so leaving the io scheduler being none. 
*/ -int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) { - unsigned int memflags; - int ret; + struct elevator_type *new_e = NULL; + int ret = 0; + WARN_ON_ONCE(q->mq_freeze_depth == 0); lockdep_assert_held(&q->elevator_lock); - memflags = blk_mq_freeze_queue(q); + if (strncmp(ctx->name, "none", 4)) { + new_e = elevator_find_get(ctx->name); + if (!new_e) + return -EINVAL; + } + blk_mq_quiesce_queue(q); if (q->elevator) { - elv_unregister_queue(q); + ctx->old = q->elevator; elevator_exit(q); } - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out_unfreeze; - - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out_unfreeze; + if (new_e) { + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + ctx->new = q->elevator; + } else { + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", ctx->name); out_unfreeze: blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q, memflags); if (ret) { pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", new_e->elevator_name); } + if (new_e) + elevator_put(new_e); return ret; } -void elevator_disable(struct request_queue *q) +static void elv_exit_and_release(struct request_queue *q) { - unsigned int memflags; - - lockdep_assert_held(&q->elevator_lock); + struct elevator_queue *e; + unsigned memflags; memflags = blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - elv_unregister_queue(q); + mutex_lock(&q->elevator_lock); + e = q->elevator; elevator_exit(q); - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - blk_add_trace_msg(q, "elv switch: none"); - - blk_mq_unquiesce_queue(q); + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + if (e) + kobject_put(&e->kobj); +} + +static int elevator_change_done(struct request_queue *q, + struct elv_change_ctx *ctx) +{ + int ret = 0; + + if (ctx->old) { + bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, + &ctx->old->flags); + + elv_unregister_queue(q, ctx->old); + kobject_put(&ctx->old->kobj); + if (enable_wbt) + wbt_enable_default(q->disk); + } + if (ctx->new) { + ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); + if (ret) + elv_exit_and_release(q); + } + return ret; } /* * Switch this queue to the given IO scheduler. */ -static int elevator_change(struct request_queue *q, const char *elevator_name) +static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) { - struct elevator_type *e; - int ret; + unsigned int memflags; + int ret = 0; - /* Make sure queue is not in the middle of being removed */ - if (!blk_queue_registered(q)) - return -ENOENT; + lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); + + memflags = blk_mq_freeze_queue(q); + /* + * May be called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + * + * Disk isn't added yet, so verifying queue lock only manually. 
+ */ + blk_mq_cancel_work_sync(q); + mutex_lock(&q->elevator_lock); + if (!(q->elevator && elevator_match(q->elevator->type, ctx->name))) + ret = elevator_switch(q, ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + if (!ret) + ret = elevator_change_done(q, ctx); + + return ret; +} + +/* + * The I/O scheduler depends on the number of hardware queues, this forces a + * reattachment when nr_hw_queues changes. + */ +void elv_update_nr_hw_queues(struct request_queue *q) +{ + struct elv_change_ctx ctx = {}; + int ret = -ENODEV; + + WARN_ON_ONCE(q->mq_freeze_depth == 0); - if (!strncmp(elevator_name, "none", 4)) { - if (q->elevator) - elevator_disable(q); - return 0; + mutex_lock(&q->elevator_lock); + if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) { + ctx.name = q->elevator->type->elevator_name; + + /* force to reattach elevator after nr_hw_queue is updated */ + ret = elevator_switch(q, &ctx); } + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue_nomemrestore(q); + if (!ret) + WARN_ON_ONCE(elevator_change_done(q, &ctx)); +} - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) - return 0; +/* + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). + */ +void elevator_set_default(struct request_queue *q) +{ + struct elv_change_ctx ctx = { + .name = "mq-deadline", + .no_uevent = true, + }; + int err = 0; - e = elevator_find_get(elevator_name); - if (!e) - return -EINVAL; - ret = elevator_switch(q, e); - elevator_put(e); - return ret; + /* now we allow to switch elevator */ + blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); + + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return; + + /* + * For single queue devices, default to using mq-deadline. If we + * have multiple queues or mq-deadline is not available, default + * to "none". + */ + if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(q->tag_set->flags))) + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, " + "falling back to \"none\"\n", ctx.name, err); } -static void elv_iosched_load_module(char *elevator_name) +void elevator_set_none(struct request_queue *q) +{ + struct elv_change_ctx ctx = { + .name = "none", + }; + int err; + + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("%s: set none elevator failed %d\n", __func__, err); +} + +static void elv_iosched_load_module(const char *elevator_name) { struct elevator_type *found; @@ -716,10 +768,14 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; - char *name; + struct elv_change_ctx ctx = {}; int ret; - unsigned int memflags; struct request_queue *q = disk->queue; + struct blk_mq_tag_set *set = q->tag_set; + + /* Make sure queue is not in the middle of being removed */ + if (!blk_queue_registered(q)) + return -ENOENT; /* * If the attribute needs to load a module, do it before freezing the @@ -727,24 +783,25 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, * queue is the one for the device storing the module file. 
*/ strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); + ctx.name = strstrip(elevator_name); - elv_iosched_load_module(name); + elv_iosched_load_module(ctx.name); - memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); - ret = elevator_change(q, name); - if (!ret) - ret = count; - mutex_unlock(&q->elevator_lock); - blk_mq_unfreeze_queue(q, memflags); + down_read(&set->update_nr_hwq_lock); + if (!blk_queue_no_elv_switch(q)) { + ret = elevator_change(q, &ctx); + if (!ret) + ret = count; + } else { + ret = -ENOENT; + } + up_read(&set->update_nr_hwq_lock); return ret; } ssize_t elv_iosched_show(struct gendisk *disk, char *name) { struct request_queue *q = disk->queue; - struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; @@ -753,7 +810,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); - cur = eq->type; + cur = q->elevator->type; } spin_lock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index e4e44dfac503..a07ce773a38f 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -121,7 +121,8 @@ struct elevator_queue }; #define ELEVATOR_FLAG_REGISTERED 0 -#define ELEVATOR_FLAG_DISABLE_WBT 1 +#define ELEVATOR_FLAG_DYING 1 +#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface @@ -182,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) +void blk_mq_sched_reg_debugfs(struct request_queue *q); +void blk_mq_sched_unreg_debugfs(struct request_queue *q); + #endif /* _ELEVATOR_H */ diff --git a/block/fops.c b/block/fops.c index 82b672d15ea4..1309861d4c2c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; @@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; + if (iov_iter_rw(iter) == WRITE) { + u16 max_write_streams = bdev_max_write_streams(bdev); + + if (iocb->ki_write_stream) { + if (iocb->ki_write_stream > max_write_streams) + return -EINVAL; + } else if (max_write_streams) { + enum rw_hint write_hint = + file_inode(iocb->ki_filp)->i_write_hint; + + /* + * Just use the write hint as write stream for block + * device writes. This assumes no file system is + * mounted that would use the streams differently. 
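 * (Note, not part of the patch: an application can steer this indirectly
 *  through the long-standing per-file hint, e.g.
 *
 *	uint64_t hint = RWH_WRITE_LIFE_SHORT;	(value 2, <linux/fcntl.h>)
 *	fcntl(fd, F_SET_RW_HINT, &hint);
 *
 *  on an O_DIRECT block-device fd. Per the code below, hint value 2 is
 *  then used as the write stream provided the device advertises at least
 *  two streams, and the limits exported in blk-sysfs.c above show up as
 *  /sys/block/<disk>/queue/max_write_streams and
 *  write_stream_granularity.)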
+ */ + if (write_hint <= max_write_streams) + iocb->ki_write_stream = write_hint; + } + } + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) @@ -451,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); - err = write_cache_pages(mapping, wbc, block_write_full_folio, - blkdev_get_block); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) + err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; diff --git a/block/genhd.c b/block/genhd.c index c2bd86cd09de..8171a6bc3210 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -125,37 +125,46 @@ static void part_stat_read_all(struct block_device *part, } } -unsigned int part_in_flight(struct block_device *part) +static void bdev_count_inflight_rw(struct block_device *part, + unsigned int inflight[2], bool mq_driver) { - unsigned int inflight = 0; int cpu; - for_each_possible_cpu(cpu) { - inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + - part_stat_local_read_cpu(part, in_flight[1], cpu); + if (mq_driver) { + blk_mq_in_driver_rw(part, inflight); + } else { + for_each_possible_cpu(cpu) { + inflight[READ] += part_stat_local_read_cpu( + part, in_flight[READ], cpu); + inflight[WRITE] += part_stat_local_read_cpu( + part, in_flight[WRITE], cpu); + } } - if ((int)inflight < 0) - inflight = 0; - return inflight; + if (WARN_ON_ONCE((int)inflight[READ] < 0)) + inflight[READ] = 0; + if (WARN_ON_ONCE((int)inflight[WRITE] < 0)) + inflight[WRITE] = 0; } -static void part_in_flight_rw(struct block_device *part, - unsigned int inflight[2]) +/** + * bdev_count_inflight - get the number of inflight IOs for a block device. + * + * @part: the block device. + * + * Inflight here means started IO accounting, from bdev_start_io_acct() for + * bio-based block device, and from blk_account_io_start() for rq-based block + * device. + */ +unsigned int bdev_count_inflight(struct block_device *part) { - int cpu; + unsigned int inflight[2] = {0}; - inflight[0] = 0; - inflight[1] = 0; - for_each_possible_cpu(cpu) { - inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); - inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); - } - if ((int)inflight[0] < 0) - inflight[0] = 0; - if ((int)inflight[1] < 0) - inflight[1] = 0; + bdev_count_inflight_rw(part, inflight, false); + + return inflight[READ] + inflight[WRITE]; } +EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. @@ -389,19 +398,35 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) return ret; } -/** - * add_disk_fwnode - add disk information to kernel list with fwnode - * @parent: parent device for the disk - * @disk: per-device partitioning information - * @groups: Additional per-device sysfs groups - * @fwnode: attached disk fwnode - * - * This function registers the partitioning information in @disk - * with the kernel. Also attach a fwnode to the disk device. 
- */ -int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - struct fwnode_handle *fwnode) +static void add_disk_final(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + /* Make sure the first partition scan will be proceed */ + if (get_capacity(disk) && disk_has_partscan(disk)) + set_bit(GD_NEED_PART_SCAN, &disk->state); + + bdev_add(disk->part0, ddev->devt); + if (get_capacity(disk)) + disk_scan_partitions(disk, BLK_OPEN_READ); + + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + } + + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); + disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); +} + +static int __add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); @@ -416,12 +441,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, */ if (disk->fops->submit_bio || disk->fops->poll_bio) return -EINVAL; - - /* - * Initialize the I/O scheduler code and pick a default one if - * needed. - */ - elevator_init_mq(disk->queue); } else { if (!disk->fops->submit_bio) return -EINVAL; @@ -438,7 +457,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) - goto out_exit_elevator; + goto out; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -448,14 +467,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) - goto out_exit_elevator; + goto out; } else { if (WARN_ON(disk->minors)) - goto out_exit_elevator; + goto out; ret = blk_alloc_ext_minor(); if (ret < 0) - goto out_exit_elevator; + goto out; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } @@ -516,21 +535,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; - - /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && disk_has_partscan(disk)) - set_bit(GD_NEED_PART_SCAN, &disk->state); - - bdev_add(disk->part0, ddev->devt); - if (get_capacity(disk)) - disk_scan_partitions(disk, BLK_OPEN_READ); - - /* - * Announce the disk and partitions after all partitions are - * created. 
(for hidden disks uevents remain suppressed forever) - */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not @@ -539,10 +543,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - - blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); - disk_add_events(disk); - set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: @@ -564,12 +564,46 @@ out_device_del: out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); -out_exit_elevator: - if (disk->queue->elevator) { - mutex_lock(&disk->queue->elevator_lock); - elevator_exit(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); +out: + return ret; +} + +/** + * add_disk_fwnode - add disk information to kernel list with fwnode + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * @fwnode: attached disk fwnode + * + * This function registers the partitioning information in @disk + * with the kernel. Also attach a fwnode to the disk device. + */ +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + int ret; + + if (queue_is_mq(disk->queue)) { + set = disk->queue->tag_set; + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + ret = __add_disk(parent, disk, groups, fwnode); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } else { + ret = __add_disk(parent, disk, groups, fwnode); } + + /* + * add_disk_final() needn't to read `nr_hw_queues`, so move it out + * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary + * lock dependency on `disk->open_mutex` from scanning partition. + */ + if (!ret) + add_disk_final(disk); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ -652,26 +686,7 @@ void blk_mark_disk_dead(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); -/** - * del_gendisk - remove the gendisk - * @disk: the struct gendisk to remove - * - * Removes the gendisk and all its associated resources. This deletes the - * partitions associated with the gendisk, and unregisters the associated - * request_queue. - * - * This is the counter to the respective __device_add_disk() call. - * - * The final removal of the struct gendisk happens when its refcount reaches 0 - * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. - * - * Drivers exist which depend on the release of the gendisk to be synchronous, - * it should not be deferred. 
- * - * Context: can sleep - */ -void del_gendisk(struct gendisk *disk) +static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; @@ -743,14 +758,7 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); - blk_mq_quiesce_queue(q); - if (q->elevator) { - mutex_lock(&q->elevator_lock); - elevator_exit(q); - mutex_unlock(&q->elevator_lock); - } rq_qos_exit(q); - blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests @@ -764,6 +772,55 @@ void del_gendisk(struct gendisk *disk) if (start_drain) blk_unfreeze_release_lock(q); } + +static void disable_elv_switch(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + WARN_ON_ONCE(!queue_is_mq(q)); + + down_write(&set->update_nr_hwq_lock); + blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); + up_write(&set->update_nr_hwq_lock); +} + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + + if (!queue_is_mq(disk->queue)) { + __del_gendisk(disk); + } else { + set = disk->queue->tag_set; + + disable_elv_switch(disk->queue); + + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + __del_gendisk(disk); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } +} EXPORT_SYMBOL(del_gendisk); /** @@ -1005,7 +1062,7 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - inflight = part_in_flight(bdev); + inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1042,19 +1099,21 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } +/* + * Show the number of IOs issued to driver. 
+ * For bio-based device, started from bdev_start_io_acct(); + * For rq-based device, started from blk_mq_start_request(); + */ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); - unsigned int inflight[2]; + unsigned int inflight[2] = {0}; - if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, bdev, inflight); - else - part_in_flight_rw(bdev, inflight); + bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); - return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); + return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } static ssize_t disk_capability_show(struct device *dev, @@ -1307,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - inflight = part_in_flight(hd); + inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); @@ -1422,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif + mutex_init(&disk->rqos_state_mutex); return disk; out_erase_part0: diff --git a/block/ioprio.c b/block/ioprio.c index 73301a261429..f0ee2798539c 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio) */ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; - fallthrough; - /* rt has prio field too */ - case IOPRIO_CLASS_BE: - if (level >= IOPRIO_NR_LEVELS) - return -EINVAL; break; + case IOPRIO_CLASS_BE: case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 754f6b7415cd..2edf1cac06d5 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -715,7 +715,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } /* - * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). + * Called from blk_mq_insert_request() or blk_mq_dispatch_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, |
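
For the reworked elv_iosched_store() path above, a scheduler switch is still requested by writing the elevator name to the queue's sysfs attribute; per the new checks, the write fails with ENOENT if the queue was never registered or once del_gendisk() has set QUEUE_FLAG_NO_ELV_SWITCH via disable_elv_switch(). A minimal userspace sketch, with the disk name and scheduler chosen purely as examples:

	/*
	 * Sketch: ask for an elevator switch through the sysfs file served
	 * by elv_iosched_store(). Disk name and scheduler are examples.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *sched = "mq-deadline";	/* example elevator name */
		int fd = open("/sys/block/nvme0n1/queue/scheduler", O_WRONLY);

		/* write() returns -1/ENOENT if the disk is going away */
		if (fd < 0 || write(fd, sched, strlen(sched)) < 0) {
			perror("scheduler");
			return 1;
		}
		close(fd);
		return 0;
	}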
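
The blkdev_direct_IO() hunk above reuses a file's write lifetime hint as the device write stream when the block device advertises write streams and the kiocb carries no explicit stream. A minimal userspace sketch of attaching such a hint to a block-device file descriptor follows; the device path and hint value are illustrative, and the fallback #defines mirror the uapi constants in case older libc headers lack them.

	#define _GNU_SOURCE			/* O_DIRECT */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	#ifndef F_SET_RW_HINT
	#define F_SET_RW_HINT		1036	/* include/uapi/linux/fcntl.h */
	#endif
	#ifndef RWH_WRITE_LIFE_SHORT
	#define RWH_WRITE_LIFE_SHORT	2	/* include/uapi/linux/fcntl.h */
	#endif

	int main(void)
	{
		/* hint value 2 becomes write stream 2 if the device has >= 2 streams */
		uint64_t hint = RWH_WRITE_LIFE_SHORT;
		void *buf;
		int fd;

		fd = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);	/* example device */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Sets inode->i_write_hint, which blkdev_direct_IO() now consults. */
		if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
			perror("F_SET_RW_HINT");

		if (posix_memalign(&buf, 4096, 4096)) {
			close(fd);
			return 1;
		}
		memset(buf, 0xab, 4096);

		/* Direct write; the bio inherits bi_write_hint and bi_write_stream. */
		if (pwrite(fd, buf, 4096, 0) < 0)
			perror("pwrite");

		free(buf);
		close(fd);
		return 0;
	}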
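
With the part_inflight_show() change above, request-based devices report requests actually issued to the driver (via blk_mq_in_driver_rw()), while bio-based devices keep counting from bdev_start_io_acct(). A small sketch that reads the resulting two-column sysfs file is below; the disk name is an example.

	/*
	 * Sketch: read the "<reads> <writes>" counters emitted by
	 * part_inflight_show(). Disk name is an example.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned int r, w;
		FILE *f = fopen("/sys/block/nvme0n1/inflight", "r");

		if (!f || fscanf(f, "%u %u", &r, &w) != 2) {
			perror("inflight");
			return 1;
		}
		printf("inflight: %u reads, %u writes\n", r, w);
		fclose(f);
		return 0;
	}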