author     Linus Torvalds <torvalds@linux-foundation.org>   2025-03-26 18:08:55 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2025-03-26 18:08:55 -0700
commit     9b960d8cd6f712cb2c03e2bdd4d5ca058238037f (patch)
tree       a1381af6c79626c0a28679f804477b43b7c91565 /block
parent     91928e0d3cc29789f4483bffee5f36218f23942b (diff)
parent     3c9f0c9326b625bf008962d58996f89a3bba1e12 (diff)
Merge tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- Fixes for integrity handling
- NVMe pull request via Keith:
- Secure concatenation for TCP transport (Hannes)
- Multipath sysfs visibility (Nilay)
- Various cleanups (Qasim, Baruch, Wang, Chen, Mike, Damien, Li)
- Correct use of 64-bit BARs for pci-epf target (Niklas)
- Socket fix for selinux when used in containers (Peijie)
- MD pull request via Yu:
- fix recovery being able to preempt resync (Li Nan)
- fix md-bitmap IO limit (Su Yue)
- fix raid10 discard with REQ_NOWAIT (Xiao Ni)
- fix raid1 memory leak (Zheng Qixing)
- fix mddev use-after-free (Yu Kuai)
- fix raid1,raid10 IO flags (Yu Kuai)
- some refactor and cleanup (Yu Kuai)
- Series cleaning up and fixing bugs in the bad block handling code
- Improve support for write failure simulation in null_blk
- Various lock ordering fixes
- Fixes for locking for debugfs attributes
- Various ublk related fixes and improvements
- Cleanups for blk-rq-qos wait handling
- blk-throttle fixes
- Fixes for loop dio and sync handling
- Fixes and cleanups for the auto-PI code
- Block side support for hardware encryption keys in blk-crypto (see the sketch after this list)
- Various cleanups and fixes
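
As a concrete illustration of the hardware-wrapped key support mentioned above, here is a minimal sketch (not from the patch itself) of how an in-kernel user might call the updated blk_crypto_init_key(), which in this series takes the key bytes, key size, and a key type. The helper name and the mode/DUN/data-unit parameters are illustrative assumptions.

```c
/*
 * Sketch only: initialize a blk-crypto key with the key-type aware
 * blk_crypto_init_key() signature added in this series. The helper
 * name and the mode/DUN/data-unit values are hypothetical.
 */
#include <linux/blk-crypto.h>

static int example_init_hw_wrapped_key(struct blk_crypto_key *key,
					const u8 *eph_key, size_t eph_key_size)
{
	/* AES-256-XTS, 8-byte DUNs, 512-byte data units (assumed values). */
	return blk_crypto_init_key(key, eph_key, eph_key_size,
				   BLK_CRYPTO_KEY_TYPE_HW_WRAPPED,
				   BLK_ENCRYPTION_MODE_AES_256_XTS,
				   8, 512);
}
```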
* tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux: (105 commits)
nvmet: replace max(a, min(b, c)) by clamp(val, lo, hi)
nvme-tcp: fix selinux denied when calling sock_sendmsg
nvmet: pci-epf: Always configure BAR0 as 64-bit
nvmet: Remove duplicate uuid_copy
nvme: zns: Simplify nvme_zone_parse_entry()
nvmet: pci-epf: Remove redundant 'flush_workqueue()' calls
nvmet-fc: Remove unused functions
nvme-pci: remove stale comment
nvme-fc: Utilise min3() to simplify queue count calculation
nvme-multipath: Add visibility for queue-depth io-policy
nvme-multipath: Add visibility for numa io-policy
nvme-multipath: Add visibility for round-robin io-policy
nvmet: add tls_concat and tls_key debugfs entries
nvmet-tcp: support secure channel concatenation
nvmet: Add 'sq' argument to alloc_ctrl_args
nvme-fabrics: reset admin connection for secure concatenation
nvme-tcp: request secure channel concatenation
nvme-keyring: add nvme_tls_psk_refresh()
nvme: add nvme_auth_derive_tls_psk()
nvme: add nvme_auth_generate_digest()
...
Diffstat (limited to 'block')
39 files changed, 1180 insertions, 800 deletions
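
For orientation before the raw diff: the badblocks changes below convert badblocks_set() and badblocks_clear() to return bool and to take a sector_t count, with partial setting now treated as failure. A minimal, hypothetical caller updated for that convention might look like the following; the function name and error handling are assumptions, not part of the patch.

```c
/*
 * Sketch only: hypothetical caller of the bool-returning badblocks_set()
 * shown in the diff below. A false return means the range could not be
 * fully recorded, e.g. because the bad block table is full.
 */
#include <linux/badblocks.h>

static int example_mark_bad(struct badblocks *bb, sector_t start,
			    sector_t nr_sectors)
{
	if (!badblocks_set(bb, start, nr_sectors, 1 /* acknowledged */))
		return -ENOSPC;
	return 0;
}
```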
diff --git a/block/Makefile b/block/Makefile index 33748123710b..3a941dc0d27f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ + bio-integrity-auto.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/badblocks.c b/block/badblocks.c index db4ec8b9b2a8..ece64e76fe8f 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -528,51 +528,6 @@ out: } /* - * Return 'true' if the range indicated by 'bad' can be backward merged - * with the bad range (from the bad table) index by 'behind'. - */ -static bool can_merge_behind(struct badblocks *bb, - struct badblocks_context *bad, int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - - if ((s < BB_OFFSET(p[behind])) && - ((s + sectors) >= BB_OFFSET(p[behind])) && - ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && - BB_ACK(p[behind]) == bad->ack) - return true; - return false; -} - -/* - * Do backward merge for range indicated by 'bad' and the bad range - * (from the bad table) indexed by 'behind'. The return value is merged - * sectors from bad->len. - */ -static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, - int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - int merged = 0; - - WARN_ON(s >= BB_OFFSET(p[behind])); - WARN_ON((s + sectors) < BB_OFFSET(p[behind])); - - if (s < BB_OFFSET(p[behind])) { - merged = BB_OFFSET(p[behind]) - s; - p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); - - WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); - } - - return merged; -} - -/* * Return 'true' if the range indicated by 'bad' can be forward * merged with the bad range (from the bad table) indexed by 'prev'. */ @@ -745,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev, *extra = 2; } - if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + if ((bb->count + (*extra)) > MAX_BADBLOCKS) return false; return true; @@ -855,40 +810,60 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. 
+ */ +static bool try_adjacent_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + if (prev >= 0 && (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + return true; + } + return false; +} + /* Do exact work to set bad block range into the bad block table */ -static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { - int retried = 0, space_desired = 0; - int orig_len, len = 0, added = 0; + int len = 0, added = 0; struct badblocks_context bad; int prev = -1, hint = -1; - sector_t orig_start; unsigned long flags; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - rounddown(s, bb->shift); - roundup(next, bb->shift); + rounddown(s, 1 << bb->shift); + roundup(next, 1 << bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); - orig_start = s; - orig_len = sectors; bad.ack = acknowledged; p = bb->page; @@ -897,6 +872,9 @@ re_insert: bad.len = sectors; len = 0; + if (badblocks_full(bb)) + goto out; + if (badblocks_empty(bb)) { len = insert_at(bb, 0, &bad); bb->count++; @@ -908,32 +886,14 @@ re_insert: /* start before all badblocks */ if (prev < 0) { - if (!badblocks_full(bb)) { - /* insert on the first */ - if (bad.len > (BB_OFFSET(p[0]) - bad.start)) - bad.len = BB_OFFSET(p[0]) - bad.start; - len = insert_at(bb, 0, &bad); - bb->count++; - added++; - hint = 0; - goto update_sectors; - } - - /* No sapce, try to merge */ - if (overlap_behind(bb, &bad, 0)) { - if (can_merge_behind(bb, &bad, 0)) { - len = behind_merge(bb, &bad, 0); - added++; - } else { - len = BB_OFFSET(p[0]) - s; - space_desired = 1; - } - hint = 0; - goto update_sectors; - } - - /* no table space and give up */ - goto out; + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = ++prev; + goto update_sectors; } /* in case p[prev-1] can be merged with p[prev] */ @@ -945,33 +905,6 @@ re_insert: goto update_sectors; } - if (overlap_front(bb, prev, &bad)) { - if (can_merge_front(bb, prev, &bad)) { - len = front_merge(bb, prev, &bad); - added++; - } else { - int extra = 0; - - if (!can_front_overwrite(bb, prev, &bad, &extra)) { - len = min_t(sector_t, - BB_END(p[prev]) - s, sectors); - hint = prev; - goto update_sectors; - } - - len = front_overwrite(bb, prev, &bad, extra); - added++; - bb->count += extra; - - if (can_combine_front(bb, prev, &bad)) { - front_combine(bb, prev); - bb->count--; - } - } - hint = prev; - goto update_sectors; - } - if (can_merge_front(bb, prev, &bad)) { len = front_merge(bb, prev, &bad); added++; @@ -979,21 +912,29 @@ re_insert: goto update_sectors; } - /* if no space in table, still try to merge in the covered range */ - if (badblocks_full(bb)) { - /* skip the cannot-merge range */ - if (((prev + 1) < bb->count) && - overlap_behind(bb, &bad, prev + 1) && - ((s + 
sectors) >= BB_END(p[prev + 1]))) { - len = BB_END(p[prev + 1]) - s; - hint = prev + 1; + if (overlap_front(bb, prev, &bad)) { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + if (extra > 0) + goto out; + + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; goto update_sectors; } - /* no retry any more */ - len = sectors; - space_desired = 1; - hint = -1; + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + } + + hint = prev; goto update_sectors; } @@ -1006,7 +947,7 @@ re_insert: len = insert_at(bb, prev + 1, &bad); bb->count++; added++; - hint = prev + 1; + hint = ++prev; update_sectors: s += len; @@ -1015,35 +956,12 @@ update_sectors: if (sectors > 0) goto re_insert; - WARN_ON(sectors < 0); - /* * Check whether the following already set range can be * merged. (prev < 0) condition is not handled here, * because it's already complicated enough. */ - if (prev >= 0 && - (prev + 1) < bb->count && - BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && - (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && - BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { - p[prev] = BB_MAKE(BB_OFFSET(p[prev]), - BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), - BB_ACK(p[prev])); - - if ((prev + 2) < bb->count) - memmove(p + prev + 1, p + prev + 2, - (bb->count - (prev + 2)) * 8); - bb->count--; - } - - if (space_desired && !badblocks_full(bb)) { - s = orig_start; - sectors = orig_len; - space_desired = 0; - if (retried++ < 3) - goto re_insert; - } + try_adjacent_combine(bb, prev); out: if (added) { @@ -1057,10 +975,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - if (!added) - rv = 1; - - return rv; + return sectors == 0; } /* @@ -1131,21 +1046,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { struct badblocks_context bad; int prev = -1, hint = -1; int len = 0, cleared = 0; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { sector_t target; @@ -1157,8 +1071,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * isn't than to think a block is not bad when it is. 
*/ target = s + sectors; - roundup(s, bb->shift); - rounddown(target, bb->shift); + roundup(s, 1 << bb->shift); + rounddown(target, 1 << bb->shift); sectors = target - s; } @@ -1214,7 +1128,7 @@ re_clear: if ((BB_OFFSET(p[prev]) < bad.start) && (BB_END(p[prev]) > (bad.start + bad.len))) { /* Splitting */ - if ((bb->count + 1) < MAX_BADBLOCKS) { + if ((bb->count + 1) <= MAX_BADBLOCKS) { len = front_splitting_clear(bb, prev, &bad); bb->count += 1; cleared++; @@ -1255,8 +1169,6 @@ update_sectors: if (sectors > 0) goto re_clear; - WARN_ON(sectors < 0); - if (cleared) { badblocks_update_acked(bb); set_changed(bb); @@ -1265,40 +1177,21 @@ update_sectors: write_sequnlock_irq(&bb->lock); if (!cleared) - rv = 1; + return false; - return rv; + return true; } /* Do the exact work to check bad blocks range from the bad block table */ -static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - int unacked_badblocks, acked_badblocks; int prev = -1, hint = -1, set = 0; struct badblocks_context bad; - unsigned int seq; + int unacked_badblocks = 0; + int acked_badblocks = 0; + u64 *p = bb->page; int len, rv; - u64 *p; - - WARN_ON(bb->shift < 0 || sectors == 0); - - if (bb->shift > 0) { - sector_t target; - - /* round the start down, and the end up */ - target = s + sectors; - rounddown(s, bb->shift); - roundup(target, bb->shift); - sectors = target - s; - } - -retry: - seq = read_seqbegin(&bb->lock); - - p = bb->page; - unacked_badblocks = 0; - acked_badblocks = 0; re_check: bad.start = s; @@ -1349,14 +1242,15 @@ re_check: len = sectors; update_sectors: + /* This situation should never happen */ + WARN_ON(sectors < len); + s += len; sectors -= len; if (sectors > 0) goto re_check; - WARN_ON(sectors < 0); - if (unacked_badblocks > 0) rv = -1; else if (acked_badblocks > 0) @@ -1364,9 +1258,6 @@ update_sectors: else rv = 0; - if (read_seqretry(&bb->lock, seq)) - goto retry; - return rv; } @@ -1404,10 +1295,30 @@ update_sectors: * -1: there are bad blocks which have not yet been acknowledged in metadata. * plus the start/length of the first bad section we overlap. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + unsigned int seq; + int rv; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + /* round the start down, and the end up */ + sector_t target = s + sectors; + + rounddown(s, 1 << bb->shift); + roundup(target, 1 << bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; } EXPORT_SYMBOL_GPL(badblocks_check); @@ -1423,11 +1334,12 @@ EXPORT_SYMBOL_GPL(badblocks_check); * decide how best to handle it. * * Return: - * 0: success - * 1: failed to set badblocks (out of space) + * true: success + * false: failed to set badblocks (out of space). Parital setting will be + * treated as failure. 
*/ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); } @@ -1444,10 +1356,10 @@ EXPORT_SYMBOL_GPL(badblocks_set); * drop the remove request. * * Return: - * 0: success - * 1: failed to clear badblocks + * true: success + * false: failed to clear badblocks */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { return _badblocks_clear(bb, s, sectors); } @@ -1479,6 +1391,11 @@ void ack_all_badblocks(struct badblocks *bb) p[i] = BB_MAKE(start, len, 1); } } + + for (i = 0; i < bb->count ; i++) + while (try_adjacent_combine(bb, i)) + ; + bb->unacked_exist = 0; } write_sequnlock_irq(&bb->lock); @@ -1564,10 +1481,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, return -EINVAL; } - if (badblocks_set(bb, sector, length, !unack)) + if (!badblocks_set(bb, sector, length, !unack)) return -ENOSPC; - else - return len; + + return len; } EXPORT_SYMBOL_GPL(badblocks_store); diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c new file mode 100644 index 000000000000..e524c609be50 --- /dev/null +++ b/block/bio-integrity-auto.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * Automatically generate and verify integrity data on PI capable devices if the + * bio submitter didn't provide PI itself. This ensures that kernel verifies + * data integrity even if the file system (or other user of the block device) is + * not aware of PI. + */ +#include <linux/blk-integrity.h> +#include <linux/workqueue.h> +#include "blk.h" + +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; +static struct workqueue_struct *kintegrityd_wq; + +static void bio_integrity_finish(struct bio_integrity_data *bid) +{ + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} + +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; + + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); + bio_endio(bio); +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Normally I/O completion is done in interrupt context. However, verifying I/O + * integrity is a time-consuming task which must be run in process context. + * + * This function postpones completion accordingly. 
+ */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); + return false; + } + + bio_integrity_finish(bid); + return true; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Checks if the bio already has an integrity payload attached. If it does, the + * payload has been generated by another kernel subsystem, and we just pass it + * through. + * Otherwise allocates integrity payload and for writes the integrity metadata + * will be generated. For reads, the completion handler will verify the + * metadata. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; + gfp_t gfp = GFP_NOIO; + unsigned int len; + void *buf; + + if (!bi) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? */ + if (bio_integrity(bio)) + return true; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) + return true; + break; + case REQ_OP_WRITE: + if (bi->flags & BLK_INTEGRITY_NOGENERATE) + return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; + } + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + + /* Allocate kernel buffer for protection data */ + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, gfp); + if (!buf) + goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); + + bid->bio = bio; + + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); + + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bid->bip.bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bid->bip.bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bid->bip.bip_flags |= BIP_CHECK_REFTAG; + + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) + goto err_end_io; + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(bio); + else + bid->saved_bio_iter = bio->bi_iter; + return true; + +err_free_buf: + kfree(buf); +err_end_io: + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return false; +} +EXPORT_SYMBOL(bio_integrity_prep); + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static int __init blk_integrity_auto_init(void) +{ + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. 
+ */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + return 0; +} +subsys_initcall(blk_integrity_auto_init); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5d81ad9a3d20..608594a154a5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,20 +7,12 @@ */ #include <linux/blk-integrity.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> #include "blk.h" -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -void blk_flush_integrity(void) -{ - flush_workqueue(kintegrityd_wq); -} +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; /** * bio_integrity_free - Free bio integrity payload @@ -30,21 +22,23 @@ void blk_flush_integrity(void) */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + kfree(bio_integrity(bio)); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) +{ + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -59,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else if (nr_vecs) { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - if (bs && mempool_initialized(&bs->bio_integrity_pool)) - mempool_free(bip, &bs->bio_integrity_pool); - else - kfree(bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -414,149 +376,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) } /** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Checks if the bio already has an integrity payload attached. 
- * If it does, the payload has been generated by another kernel subsystem, - * and we just pass it through. Otherwise allocates integrity payload. - * The bio must have data direction, target device and start sector set priot - * to calling. In the WRITE case, integrity metadata will be generated using - * the block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -bool bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - unsigned int len; - void *buf; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; - break; - case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (unlikely(buf == NULL)) { - goto err_end_io; - } - - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } - - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); - - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; - - /* describe what tags to check in payload */ - if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - blk_integrity_generate(bio); - else - bip->bio_iter = bio->bi_iter; - return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - - blk_integrity_verify(bio); - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - bio_endio(bio); -} - -/** - * __bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. 
- */ -bool __bio_integrity_endio(struct bio *bio) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); - return false; - } - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - return true; -} - -/** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed @@ -617,44 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) - return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index 6ac5983ba51e..4e6c85a33d74 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1026,9 +1026,10 @@ EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { + unsigned long nr = off / PAGE_SIZE; + WARN_ON_ONCE(len > UINT_MAX); - WARN_ON_ONCE(off > UINT_MAX); - __bio_add_page(bio, &folio->page, len, off); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); @@ -1049,9 +1050,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail); bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { - if (len > UINT_MAX || off > UINT_MAX) + unsigned long nr = off / PAGE_SIZE; + + if (len > UINT_MAX) return false; - return bio_add_page(bio, &folio->page, len, off) > 0; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); @@ -1657,7 +1660,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1737,8 +1739,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1754,9 +1754,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 
1994a11ff903..5905f277057b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -816,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) ctx->bdev = bdev; return 0; } +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * acquires q->elevator_lock, and ensures the correct locking order + * between q->elevator_lock and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. + */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we haven’t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue and acquiring q->elevator_lock to maintain the correct + * locking order. + */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->elevator_lock); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; +} /** * blkg_conf_prep - parse and prepare for per-blkg config update @@ -972,6 +1007,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_exit); +/* + * Similar to blkg_conf_exit, but also unfreezes the queue and releases + * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen + * is used to open the bdev. + */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) +{ + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; + + blkg_conf_exit(ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + } +} + static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; @@ -1728,27 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* - * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy - * without pd_alloc_fn/pd_free_fn can't be activated. 
- */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1759,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2c4663bd993a..81868ad86330 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -219,9 +219,11 @@ struct blkg_conf_ctx { void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-core.c b/block/blk-core.c index d6c4fa3943b5..4623de79effa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); @@ -455,6 +456,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); + /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */ + fs_reclaim_acquire(GFP_KERNEL); + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); + fs_reclaim_release(GFP_KERNEL); + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 29a205482617..f154be0b575a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split; * This is the key we set when evicting a keyslot. This *should* be the all 0's * key, but AES-XTS rejects that key, so we use some random bytes instead. */ -static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; +static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -539,7 +539,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) @@ -561,6 +561,7 @@ static int blk_crypto_fallback_init(void) blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; /* All blk-crypto modes have a crypto API fallback. 
*/ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 93a141979694..ccf6dff6ff6b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -14,6 +14,7 @@ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ + unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; @@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq) return false; } +static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 7fabc883e39f..94a155912bf1 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return false; if (profile->max_dun_bytes_supported < cfg->dun_bytes) return false; + if (!(profile->key_types_supported & cfg->key_type)) + return false; return true; } @@ -463,6 +465,99 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, EXPORT_SYMBOL_GPL(blk_crypto_register); /** + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device that supports hardware-wrapped keys + * @eph_key: a hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes + * @sw_secret: (output) the software secret + * + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. + * + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid + * ephemerally-wrapped key, or another -errno code. 
+ */ +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.derive_sw_secret) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size, + sw_secret); + blk_crypto_hw_exit(profile); + return err; +} + +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.import_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size, + lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.generate_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.generate_key(profile, lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.prepare_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size, + eph_key); + blk_crypto_hw_exit(profile); + return ret; +} + +/** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device * @parent: the crypto profile for the parent device @@ -485,10 +580,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, child->max_dun_bytes_supported); for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) parent->modes_supported[i] &= child->modes_supported[i]; + parent->key_types_supported &= child->key_types_supported; } else { parent->max_dun_bytes_supported = 0; memset(parent->modes_supported, 0, sizeof(parent->modes_supported)); + parent->key_types_supported = 0; } } EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); @@ -521,6 +618,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, target->max_dun_bytes_supported) return false; + if (reference->key_types_supported & ~target->key_types_supported) + return false; + return true; } EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); @@ -555,5 +655,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, sizeof(dst->modes_supported)); dst->max_dun_bytes_supported = src->max_dun_bytes_supported; + dst->key_types_supported = src->key_types_supported; } EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index a304434489ba..e832f403f200 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -31,6 +31,13 @@ static struct 
blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) return container_of(attr, struct blk_crypto_attr, attr); } +static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { @@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, return sysfs_emit(page, "%u\n", profile->num_slots); } +static ssize_t raw_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + #define BLK_CRYPTO_RO_ATTR(_name) \ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) +BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); BLK_CRYPTO_RO_ATTR(num_keyslots); +BLK_CRYPTO_RO_ATTR(raw_keys); + +static umode_t blk_crypto_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + if (a == &hw_wrapped_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return 0; + if (a == &raw_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW)) + return 0; + + return 0444; +} static struct attribute *blk_crypto_attrs[] = { + &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, + &raw_keys_attr.attr, NULL, }; static const struct attribute_group blk_crypto_attr_group = { .attrs = blk_crypto_attrs, + .is_visible = blk_crypto_is_visible, }; /* diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 4d760b092deb..4b1ad84d1b5a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, + .security_strength = 16, .ivsize = 16, }, }; @@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void) /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); - /* Sanity check that no algorithm exceeds the defined limits. */ + /* + * Validate the crypto mode properties. This ideally would be done with + * static assertions, but boot-time checks are the next best thing. + */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { - BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].keysize > + BLK_CRYPTO_MAX_RAW_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].security_strength > + blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } @@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. 
- * @raw_key: Pointer to the raw key. Must be the correct length for the chosen - * @crypto_mode; see blk_crypto_modes[]. + * @key_bytes: the bytes of the key + * @key_size: size of the key in bytes + * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for - * zeroizing both blk_key and raw_key when done with them. + * zeroizing both blk_key and key_bytes when done with them. */ -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) @@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; - if (mode->keysize == 0) + switch (key_type) { + case BLK_CRYPTO_KEY_TYPE_RAW: + if (key_size != mode->keysize) + return -EINVAL; + break; + case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: + if (key_size < mode->security_strength || + key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + return -EINVAL; + break; + default: return -EINVAL; + } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; @@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); - blk_key->size = mode->keysize; - memcpy(blk_key->raw, raw_key, mode->keysize); + blk_key->size = key_size; + memcpy(blk_key->bytes, key_bytes, key_size); return 0; } @@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { - return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_crypto_config_supported_natively(bdev, cfg); + if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && + cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) + return true; + return blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev, * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * - * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and - * blk-crypto-fallback is either disabled or the needed algorithm - * is disabled in the crypto API; or another -errno code. + * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does + * not support wrapped keys; -ENOPKG if the key is a raw key but the + * hardware does not support raw keys and blk-crypto-fallback is either + * disabled or the needed algorithm is disabled in the crypto API; or + * another -errno code if something else went wrong. 
*/ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; + if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { + pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); + return -EOPNOTSUPP; + } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev, pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); + +static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_import_key_arg arg; + u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) + return -EINVAL; + + if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), + arg.raw_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(raw_key, sizeof(raw_key)); + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_generate_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + ret = blk_crypto_generate_key(profile, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_prepare_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.lt_key_size > sizeof(lt_key)) + return -EINVAL; + + if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), + arg.lt_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); + if (ret < 0) + goto out; + if (ret > arg.eph_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.eph_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, + arg.eph_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + memzero_explicit(eph_key, sizeof(eph_key)); + return ret; +} + +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + 
void __user *argp) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + + if (!profile) + return -EOPNOTSUPP; + + switch (cmd) { + case BLKCRYPTOIMPORTKEY: + return blk_crypto_ioctl_import_key(profile, argp); + case BLKCRYPTOGENERATEKEY: + return blk_crypto_ioctl_generate_key(profile, argp); + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl_prepare_key(profile, argp); + default: + return -ENOTTY; + } +} diff --git a/block/blk-flush.c b/block/blk-flush.c index a72e2a83d075..43d6152897a4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,9 +95,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +blk_get_flush_queue(struct blk_mq_ctx *ctx) { - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; + return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) @@ -205,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, struct list_head *running; struct request *rq, *n; unsigned long flags = 0; - struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -341,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; - struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); @@ -382,7 +382,7 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ed11438eb4be..5bfd70311359 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2718,8 +2718,7 @@ retry_lock: * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. 
*/ - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); - wait.wait.private = current; + init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ @@ -3223,14 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; - unsigned int memflags; + unsigned long memflags; int ret; blkg_conf_init(&ctx, input); - ret = blkg_conf_open_bdev(&ctx); - if (ret) + memflags = blkg_conf_open_bdev_frozen(&ctx); + if (IS_ERR_VALUE(memflags)) { + ret = memflags; goto err; + } body = ctx.body; disk = ctx.bdev->bd_disk; @@ -3247,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3347,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue, memflags); - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); - blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue, memflags); - ret = -EINVAL; err: - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return ret; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 1d1589c35297..fdd4efb54c6c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -551,8 +551,8 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries. 
*/ -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) { struct req_iterator iter = { .bio = rq->bio, diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index adf5f0697b6b..3421b5521fe2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -347,9 +347,14 @@ static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; + int res; + res = mutex_lock_interruptible(&hctx->queue->elevator_lock); + if (res) + return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, ¶ms); + mutex_unlock(&hctx->queue->elevator_lock); return 0; } @@ -400,15 +405,14 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) @@ -417,15 +421,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) @@ -434,15 +437,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) @@ -451,15 +453,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_active_show(void *data, struct seq_file *m) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7442ca27c2bf..109611445d40 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -349,7 +349,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, } ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3feeeccf8a99..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) 
return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index b9f417d980b4..d880c50629d6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -190,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, - data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 40490ac88045..ae8494d88897 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -508,7 +508,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* @@ -3314,6 +3314,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -4094,6 +4095,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&q->elevator_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4198,6 +4201,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + mutex_unlock(&q->elevator_lock); } /* @@ -4467,7 +4472,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4500,7 +4505,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4933,10 +4938,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, if (!qe) return false; - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + /* Accessing q->elevator needs protection from ->elevator_lock. 
*/ + mutex_lock(&q->elevator_lock); - /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; @@ -4950,7 +4954,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); elevator_disable(q); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return true; } @@ -4980,11 +4984,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, diff --git a/block/blk-mq.h b/block/blk-mq.h index 44979e92b79f..3011a78cf16a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -100,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue - * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - blk_opf_t opf, +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d4d4f4dc0e23..95982bc46ba1 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) struct rq_qos_wait_data { struct wait_queue_entry wq; - struct task_struct *task; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; @@ -218,7 +217,20 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - wake_up_process(data->task); + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). + */ + default_wake_function(curr, mode, wake_flags, key); + /* + * Note that the order of operations is important as finish_wait() + * tests whether @curr is removed without grabbing the lock. This + * should be the last thing to do to make sure we will not have a + * UAF access to @data. And the semantics of memory barrier in it + * also make sure the waiter will see the latest @data->got_token + * once list_empty_careful() in finish_wait() returns true. + */ list_del_init_careful(&curr->entry); return 1; } @@ -244,41 +256,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + .got_token = false, }; - bool has_sleeper; + bool first_waiter; - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + /* + * If there are no waiters in the waiting queue, try to increase the + * inflight counter if we can. Otherwise, prepare for adding ourselves + * to the waiting queue. 
+ */ + if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; - has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + init_wait_func(&data.wq, rq_qos_wake_function); + first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + /* + * Make sure there is at least one inflight process; otherwise, waiters + * will never be woken up. Since there may be no inflight process before + * adding ourselves to the waiting queue above, we need to try to + * increase the inflight counter for ourselves. And it is sufficient to + * guarantee that at least the first waiter to enter the waiting queue + * will re-check the waiting condition before going to sleep, thus + * ensuring forward progress. + */ + if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + * + * Enough memory barrier in list_empty_careful() in + * finish_wait() is paired with list_del_init_careful() + * in rq_qos_wake_function() to make sure we will see + * the latest @data->got_token. + */ + if (data.got_token) + cleanup_cb(rqw, private_data); + return; + } + + /* we are now relying on the waker to increase our inflight counter. */ do { - /* The memory barrier in set_current_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with rq_qos_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - if (data.got_token) - cleanup_cb(rqw, private_data); - return; - } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); diff --git a/block/blk-settings.c b/block/blk-settings.c index b9c6f0ec1c49..6b2dbe645d23 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -21,7 +21,7 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { - q->rq_timeout = timeout; + WRITE_ONCE(q->rq_timeout, timeout); } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); @@ -114,9 +114,15 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) pr_warn("invalid PI settings.\n"); return -EINVAL; } + bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY; return 0; } + if (lim->features & BLK_FEAT_BOUNCE_HIGH) { + pr_warn("no bounce buffer support for integrity metadata\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; @@ -867,36 +873,28 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return true; - if (!ti->tuple_size) { - /* inherit the settings from the first underlying device */ - if (!(ti->flags & BLK_INTEGRITY_STACKED)) { - ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | - (bi->flags & BLK_INTEGRITY_REF_TAG); - ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; - ti->pi_offset = bi->pi_offset; - ti->interval_exp = bi->interval_exp; - ti->tag_size = bi->tag_size; - goto done; - } - if (!bi->tuple_size) - goto done; + if (ti->flags & BLK_INTEGRITY_STACKED) { + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != 
bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + } else { + ti->flags = BLK_INTEGRITY_STACKED; + ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; } - - if (ti->tuple_size != bi->tuple_size) - goto incompatible; - if (ti->interval_exp != bi->interval_exp) - goto incompatible; - if (ti->tag_size != bi->tag_size) - goto incompatible; - if (ti->csum_type != bi->csum_type) - goto incompatible; - if ((ti->flags & BLK_INTEGRITY_REF_TAG) != - (bi->flags & BLK_INTEGRITY_REF_TAG)) - goto incompatible; - -done: - ti->flags |= BLK_INTEGRITY_STACKED; return true; incompatible: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6f548a4376aa..a2882751f0d2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -23,10 +23,11 @@ struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*show_limit)(struct gendisk *disk, char *page); + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); - void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t @@ -52,7 +53,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count) static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->queue->nr_requests, page); + ssize_t ret; + + mutex_lock(&disk->queue->elevator_lock); + ret = queue_var_show(disk->queue->nr_requests, page); + mutex_unlock(&disk->queue->elevator_lock); + return ret; } static ssize_t @@ -60,27 +66,38 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; + unsigned int memflags; + struct request_queue *q = disk->queue; - if (!queue_is_mq(disk->queue)) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) - return err; - + ret = err; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + ssize_t ret; + + mutex_lock(&disk->queue->limits_lock); + ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + mutex_unlock(&disk->queue->limits_lock); + + return ret; } static ssize_t @@ -88,11 +105,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; + unsigned int memflags; + struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. 
+ */ + mutex_lock(&q->limits_lock); + memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); + blk_mq_unfreeze_queue(q, memflags); + return ret; } @@ -238,8 +266,9 @@ static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + return sysfs_emit(page, "%u\n", - !!(disk->queue->limits.features & BLK_FEAT_POLL)); + !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) @@ -286,17 +315,21 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; + unsigned int memflags; + struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); + memflags = blk_mq_freeze_queue(q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -316,11 +349,19 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; + unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; + /* + * Here we update two queue flags each using atomic bitops, although + * updating two flags isn't atomic it should be harmless as those flags + * are accessed individually using atomic test_bit operation. So we + * don't grab any lock while updating these flags. 
+ */ + memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -331,6 +372,7 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } + blk_mq_unfreeze_queue(q, memflags); #endif return ret; } @@ -344,29 +386,43 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!(disk->queue->limits.features & BLK_FEAT_POLL)) - return -EINVAL; + unsigned int memflags; + ssize_t ret = count; + struct request_queue *q = disk->queue; + + memflags = blk_mq_freeze_queue(q); + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = -EINVAL; + goto out; + } + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); - return count; +out: + blk_mq_unfreeze_queue(q, memflags); + return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); + return sysfs_emit(page, "%u\n", + jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val; + unsigned int val, memflags; int err; + struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; - blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); + memflags = blk_mq_freeze_queue(q); + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_mq_unfreeze_queue(q, memflags); return count; } @@ -412,57 +468,55 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; -#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .store_limit = _prefix##_store, \ + .attr = { .name = _name, .mode = 0444 }, \ + .show_limit = _prefix##_show, \ } -#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .load_module = _prefix##_load_module, \ - .store = _prefix##_store, \ + .show_limit = _prefix##_show, \ + .store_limit = _prefix##_store, \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); -QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); -QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); -QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler"); - -QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); -QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); -QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); -QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); -QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); - -QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); -QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); 
-QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); + +QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); + +QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); -QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, +QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); -QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); -QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); -QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); @@ -470,16 +524,16 @@ QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); -QUEUE_RO_ENTRY(queue_fua, "fua"); -QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); +QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); -QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); +QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = 0444 }, - .show = queue_logical_block_size_show, + .attr = {.name = "hw_sector_size", .mode = 0444 
}, + .show_limit = queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); @@ -503,14 +557,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(disk->queue)) - return -EINVAL; + ssize_t ret; + struct request_queue *q = disk->queue; - if (wbt_disabled(disk->queue)) - return sysfs_emit(page, "0\n"); + mutex_lock(&q->elevator_lock); + if (!wbt_rq_qos(q)) { + ret = -EINVAL; + goto out; + } + + if (wbt_disabled(q)) { + ret = sysfs_emit(page, "0\n"); + goto out; + } - return sysfs_emit(page, "%llu\n", - div_u64(wbt_get_min_lat(disk->queue), 1000)); + ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +out: + mutex_unlock(&q->elevator_lock); + return ret; } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, @@ -520,6 +584,7 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, struct rq_qos *rqos; ssize_t ret; s64 val; + unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -527,20 +592,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) - return ret; + goto out; } + ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) - return count; + goto out; /* * Ensure that the queue is idled, in case the latency update @@ -552,8 +621,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); +out: + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); - return count; + return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); @@ -561,7 +633,9 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { - &queue_ra_entry.attr, + /* + * Attributes which are protected with q->limits_lock. + */ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, @@ -577,44 +651,58 @@ static struct attribute *queue_attrs[] = { &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, - &queue_discard_zeroes_data_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, - &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, - &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, - &queue_nomerges_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, - &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_poll_delay_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, + &queue_ra_entry.attr, + + /* + * Attributes which don't require locking. 
+ */ + &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, + &queue_nr_zones_entry.attr, + &queue_nomerges_entry.attr, + &queue_poll_entry.attr, + &queue_poll_delay_entry.attr, + NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - &queue_requests_entry.attr, + /* + * Attributes which require some form of locking other than + * q->sysfs_lock. + */ &elv_iosched_entry.attr, - &queue_rq_affinity_entry.attr, - &queue_io_timeout_entry.attr, + &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which don't require locking. + */ + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, + NULL, }; @@ -664,14 +752,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - ssize_t res; - if (!entry->show) + if (!entry->show && !entry->show_limit) return -EIO; - mutex_lock(&disk->queue->sysfs_lock); - res = entry->show(disk, page); - mutex_unlock(&disk->queue->sysfs_lock); - return res; + + if (entry->show_limit) { + ssize_t res; + + mutex_lock(&disk->queue->limits_lock); + res = entry->show_limit(disk, page); + mutex_unlock(&disk->queue->limits_lock); + return res; + } + + return entry->show(disk, page); } static ssize_t @@ -681,21 +775,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - unsigned int memflags; - ssize_t res; if (!entry->store_limit && !entry->store) return -EIO; - /* - * If the attribute needs to load a module, do it before freezing the - * queue to ensure that the module file can be read when the request - * queue is the one for the device storing the module file. 
- */ - if (entry->load_module) - entry->load_module(disk, page, length); - if (entry->store_limit) { + ssize_t res; + struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); @@ -710,12 +796,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return length; } - mutex_lock(&q->sysfs_lock); - memflags = blk_mq_freeze_queue(q); - res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); - return res; + return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { @@ -784,18 +865,22 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_debugfs_remove; + ret = blk_crypto_sysfs_register(disk); + if (ret) + goto out_unregister_ia_ranges; + + mutex_lock(&q->elevator_lock); if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) - goto out_unregister_ia_ranges; + if (ret) { + mutex_unlock(&q->elevator_lock); + goto out_crypto_sysfs_unregister; + } } - - ret = blk_crypto_sysfs_register(disk); - if (ret) - goto out_elv_unregister; + wbt_enable_default(disk); + mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); @@ -817,8 +902,8 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_elv_unregister: - elv_unregister_queue(q); +out_crypto_sysfs_unregister: + blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -864,8 +949,11 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elv_unregister_queue(q); + mutex_unlock(&q->elevator_lock); + + mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8d149aff9fd0..91dab43c65ab 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -478,8 +478,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -498,16 +496,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, - bool clear_carryover) + bool clear) { - tg->bytes_disp[rw] = 0; - tg->io_disp[rw] = 0; + if (clear) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - if (clear_carryover) { - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; - } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -599,29 +595,34 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. 
*/ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); - if (!time_elapsed) + /* Don't trim slice until at least 2 slices are used */ + if (time_elapsed < tg->td->throtl_slice * 2) return; + /* + * The bio submission time may be a few jiffies more than the expected + * waiting time, due to 'extra_bytes' can't be divided in + * tg_within_bps_limit(), and also due to timer wakeup delay. In this + * case, adjust slice_start will discard the extra wait time, causing + * lower rate than expected. Therefore, other than the above rounddown, + * one extra slice is preserved for deviation. + */ + time_elapsed -= tg->td->throtl_slice; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed) + - tg->carryover_bytes[rw]; - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + - tg->carryover_ios[rw]; + time_elapsed); + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); if (bytes_trim <= 0 && io_trim <= 0) return; - tg->carryover_bytes[rw] = 0; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - tg->carryover_ios[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else @@ -636,7 +637,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) jiffies); } -static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +static void __tg_update_carryover(struct throtl_grp *tg, bool rw, + long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); @@ -649,26 +651,28 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw) * configuration. */ if (bps_limit != U64_MAX) - tg->carryover_bytes[rw] += - calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) - tg->carryover_ios[rw] += - calculate_io_allowed(iops_limit, jiffy_elapsed) - + *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; + tg->bytes_disp[rw] -= *bytes; + tg->io_disp[rw] -= *ios; } static void tg_update_carryover(struct throtl_grp *tg) { + long long bytes[2] = {0}; + int ios[2] = {0}; + if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ); + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); /* see comments in struct throtl_grp for meaning of these fields. 
*/ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, - tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], - tg->carryover_ios[READ], tg->carryover_ios[WRITE]); + bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, @@ -686,8 +690,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + - tg->carryover_ios[rw]; + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; @@ -720,8 +723,7 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + - tg->carryover_bytes[rw]; + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; @@ -810,13 +812,10 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->bytes_disp[rw] += bio_size; - tg->last_bytes_disp[rw] += bio_size; - } tg->io_disp[rw]++; - tg->last_io_disp[rw]++; } /** @@ -1614,13 +1613,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) return tg_may_dispatch(tg, bio, NULL); } -static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) -{ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); - tg->carryover_ios[rw]--; -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1657,10 +1649,12 @@ bool __blk_throtl_bio(struct bio *bio) /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. - * Debts are handled by carryover_bytes/ios while - * calculating wait time. + * + * Charge and dispatch directly, and our throttle + * control algorithm is adaptive, and extra IO bytes + * will be throttled for paying the debt */ - tg_dispatch_in_debt(tg, bio, rw); + throtl_charge_bio(tg, bio); } else { /* if above limits, break to queue */ break; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 1a36d1278eea..7964cc041e06 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -102,12 +102,9 @@ struct throtl_grp { unsigned int iops[2]; /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; + int64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; + int io_disp[2]; /* * The following two fields are updated when new configuration is diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6dfc659d22e2..f1754d07f7e0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -136,8 +136,9 @@ enum { RWB_MIN_WRITE_SAMPLES = 3, /* - * If we have this number of consecutive windows with not enough - * information to scale up or down, scale up. 
+ * If we have this number of consecutive windows without enough + * information to scale up or down, slowly return to center state + * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; @@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) break; case LAT_UNKNOWN_WRITES: /* - * We started a the center step, but don't have a valid - * read/write sample, but we do have writes going on. - * Allow step to go negative, to increase write perf. + * We don't have a valid read/write sample, but we do have + * writes going on. Allow step to go negative, to increase + * write performance. */ scale_up(rwb); break; @@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) __wbt_done(rqos, flags); } -/* - * May sleep, if we have exceeded the writeback limits. Caller can pass - * in an irq held spinlock, if it holds one when calling this function. - * If we do sleep, we'll release and re-grab it. - */ +/* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); diff --git a/block/blk.h b/block/blk.h index 9cf9a0099416..006e3be433d2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -715,7 +715,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); -void blk_integrity_verify(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/bounce.c b/block/bounce.c index 0d898cd5ec49..09a9616cf209 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -41,8 +41,6 @@ static void init_bounce_bioset(void) ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 93523d8f8195..9ceb5d0832f5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; } diff --git a/block/elevator.c b/block/elevator.c index cd2ce4921601..b4d08026b02c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { @@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) unsigned int memflags; int ret; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -655,7 +655,7 @@ void 
elevator_disable(struct request_queue *q) { unsigned int memflags; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -700,34 +700,44 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -void elv_iosched_load_module(struct gendisk *disk, const char *buf, - size_t count) +static void elv_iosched_load_module(char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; - const char *name; - - strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); spin_lock(&elv_list_lock); - found = __elevator_find(name); + found = __elevator_find(elevator_name); spin_unlock(&elv_list_lock); if (!found) - request_module("%s-iosched", name); + request_module("%s-iosched", elevator_name); } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; + char *name; int ret; + unsigned int memflags; + struct request_queue *q = disk->queue; + /* + * If the attribute needs to load a module, do it before freezing the + * queue to ensure that the module file can be read when the request + * queue is the one for the device storing the module file. + */ strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(disk->queue, strstrip(elevator_name)); + name = strstrip(elevator_name); + + elv_iosched_load_module(name); + + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + ret = elevator_change(q, name); if (!ret) - return count; + ret = count; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -738,6 +748,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; + mutex_lock(&q->elevator_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { @@ -755,6 +766,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); + mutex_unlock(&q->elevator_lock); + return len; } diff --git a/block/elevator.h b/block/elevator.h index e526662c5dbb..e4e44dfac503 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -148,8 +148,6 @@ extern void elv_unregister(struct elevator_type *); * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); -void elv_iosched_load_module(struct gendisk *disk, const char *page, - size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); diff --git a/block/genhd.c b/block/genhd.c index e9375e20d866..c2bd86cd09de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -565,8 +565,11 @@ out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: - if (disk->queue->elevator) + if (disk->queue->elevator) { + mutex_lock(&disk->queue->elevator_lock); elevator_exit(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); + } return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk) blk_mq_quiesce_queue(q); if (q->elevator) { - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_exit(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); diff --git a/block/ioctl.c b/block/ioctl.c index 6554b728bae6..faa40f383e27 100644 --- a/block/ioctl.c +++ 
b/block/ioctl.c @@ -15,6 +15,7 @@ #include <linux/io_uring/cmd.h> #include <uapi/linux/blkdev.h> #include "blk.h" +#include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) @@ -620,6 +621,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKCRYPTOIMPORTKEY: + case BLKCRYPTOGENERATEKEY: + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index dc31f2dfa414..0f0f8452609a 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 9cc6b8c1eea4..b5ecddd5181a 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state) p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index ddf9e6def4b2..2419af76120f 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 2577114ff20c..851db518ee5e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio) } } -void blk_integrity_verify(struct bio *bio) +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); @@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio) */ iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.seed = bip->bio_iter.bi_sector; + iter.seed = saved_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { void *kaddr = bvec_kmap_local(&bv); blk_status_t ret = BLK_STS_OK; |
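The blk-crypto changes earlier in this diff wire up three new block-device ioctls (BLKCRYPTOIMPORTKEY, BLKCRYPTOGENERATEKEY, BLKCRYPTOPREPAREKEY) for hardware-wrapped key management, dispatched from blkdev_common_ioctl() above. A minimal userspace sketch of the import path follows; it is not part of the patch. It assumes the argument structure and ioctl numbers are exposed through a uapi header (shown here as <linux/blk-crypto.h>); that header name and any struct details beyond the fields the handlers above actually read and write are assumptions.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blk-crypto.h>	/* assumed uapi header for the BLKCRYPTO* ioctls */

/*
 * Import a raw key into the crypto profile of the block device at @bdev_path
 * and receive the long-term wrapped key blob back.  Returns the blob size on
 * success, -1 on error.  Per the handler above, the call fails with
 * EOPNOTSUPP if the queue has no crypto profile, the reserved[] field must be
 * zero, raw_key_size must be at least 16 bytes and at most the kernel's
 * maximum raw key size, and the actual blob size is written back into
 * lt_key_size.
 */
static int blk_crypto_import(const char *bdev_path,
			     const uint8_t *raw_key, uint32_t raw_key_size,
			     uint8_t *lt_key, uint32_t lt_key_size)
{
	struct blk_crypto_import_key_arg arg;
	int fd, ret;

	fd = open(bdev_path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return -1;
	}

	memset(&arg, 0, sizeof(arg));		/* reserved[] must be all zeroes */
	arg.raw_key_ptr = (uint64_t)(uintptr_t)raw_key;
	arg.raw_key_size = raw_key_size;
	arg.lt_key_ptr = (uint64_t)(uintptr_t)lt_key;
	arg.lt_key_size = lt_key_size;

	ret = ioctl(fd, BLKCRYPTOIMPORTKEY, &arg);
	close(fd);
	if (ret != 0) {
		perror("BLKCRYPTOIMPORTKEY");
		return -1;
	}
	return (int)arg.lt_key_size;
}

BLKCRYPTOGENERATEKEY and BLKCRYPTOPREPAREKEY follow the same pattern, using only an lt_key buffer for generation and an lt_key/eph_key buffer pair for preparing an ephemerally-wrapped key, respectively.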