author     Linus Torvalds <torvalds@linux-foundation.org>    2025-03-26 18:08:55 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>    2025-03-26 18:08:55 -0700
commit     9b960d8cd6f712cb2c03e2bdd4d5ca058238037f (patch)
tree       a1381af6c79626c0a28679f804477b43b7c91565 /block
parent     91928e0d3cc29789f4483bffee5f36218f23942b (diff)
parent     3c9f0c9326b625bf008962d58996f89a3bba1e12 (diff)
Merge tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - Fixes for integrity handling

 - NVMe pull request via Keith:
      - Secure concatenation for TCP transport (Hannes)
      - Multipath sysfs visibility (Nilay)
      - Various cleanups (Qasim, Baruch, Wang, Chen, Mike, Damien, Li)
      - Correct use of 64-bit BARs for pci-epf target (Niklas)
      - Socket fix for selinux when used in containers (Peijie)

 - MD pull request via Yu:
      - fix recovery can preempt resync (Li Nan)
      - fix md-bitmap IO limit (Su Yue)
      - fix raid10 discard with REQ_NOWAIT (Xiao Ni)
      - fix raid1 memory leak (Zheng Qixing)
      - fix mddev uaf (Yu Kuai)
      - fix raid1,raid10 IO flags (Yu Kuai)
      - some refactor and cleanup (Yu Kuai)

 - Series cleaning up and fixing bugs in the bad block handling code

 - Improve support for write failure simulation in null_blk

 - Various lock ordering fixes

 - Fixes for locking for debugfs attributes

 - Various ublk related fixes and improvements

 - Cleanups for blk-rq-qos wait handling

 - blk-throttle fixes

 - Fixes for loop dio and sync handling

 - Fixes and cleanups for the auto-PI code

 - Block side support for hardware encryption keys in blk-crypto

 - Various cleanups and fixes

* tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux: (105 commits)
  nvmet: replace max(a, min(b, c)) by clamp(val, lo, hi)
  nvme-tcp: fix selinux denied when calling sock_sendmsg
  nvmet: pci-epf: Always configure BAR0 as 64-bit
  nvmet: Remove duplicate uuid_copy
  nvme: zns: Simplify nvme_zone_parse_entry()
  nvmet: pci-epf: Remove redundant 'flush_workqueue()' calls
  nvmet-fc: Remove unused functions
  nvme-pci: remove stale comment
  nvme-fc: Utilise min3() to simplify queue count calculation
  nvme-multipath: Add visibility for queue-depth io-policy
  nvme-multipath: Add visibility for numa io-policy
  nvme-multipath: Add visibility for round-robin io-policy
  nvmet: add tls_concat and tls_key debugfs entries
  nvmet-tcp: support secure channel concatenation
  nvmet: Add 'sq' argument to alloc_ctrl_args
  nvme-fabrics: reset admin connection for secure concatenation
  nvme-tcp: request secure channel concatenation
  nvme-keyring: add nvme_tls_psk_refresh()
  nvme: add nvme_auth_derive_tls_psk()
  nvme: add nvme_auth_generate_digest()
  ...
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile               |   3
-rw-r--r--  block/badblocks.c            | 327
-rw-r--r--  block/bio-integrity-auto.c   | 191
-rw-r--r--  block/bio-integrity.c        | 266
-rw-r--r--  block/bio.c                  |  17
-rw-r--r--  block/blk-cgroup.c           |  73
-rw-r--r--  block/blk-cgroup.h           |   2
-rw-r--r--  block/blk-core.c             |   7
-rw-r--r--  block/blk-crypto-fallback.c  |   7
-rw-r--r--  block/blk-crypto-internal.h  |  10
-rw-r--r--  block/blk-crypto-profile.c   | 101
-rw-r--r--  block/blk-crypto-sysfs.c     |  35
-rw-r--r--  block/blk-crypto.c           | 204
-rw-r--r--  block/blk-flush.c            |  10
-rw-r--r--  block/blk-iocost.c           |  20
-rw-r--r--  block/blk-merge.c            |   4
-rw-r--r--  block/blk-mq-debugfs.c       |  41
-rw-r--r--  block/blk-mq-sched.c         |   2
-rw-r--r--  block/blk-mq-sysfs.c         |   4
-rw-r--r--  block/blk-mq-tag.c           |   3
-rw-r--r--  block/blk-mq.c               |  22
-rw-r--r--  block/blk-mq.h               |   4
-rw-r--r--  block/blk-rq-qos.c           |  82
-rw-r--r--  block/blk-settings.c         |  58
-rw-r--r--  block/blk-sysfs.c            | 304
-rw-r--r--  block/blk-throttle.c         |  82
-rw-r--r--  block/blk-throttle.h         |   7
-rw-r--r--  block/blk-wbt.c              |  17
-rw-r--r--  block/blk.h                  |   2
-rw-r--r--  block/bounce.c               |   2
-rw-r--r--  block/bsg-lib.c              |   2
-rw-r--r--  block/elevator.c             |  43
-rw-r--r--  block/elevator.h             |   2
-rw-r--r--  block/genhd.c                |   9
-rw-r--r--  block/ioctl.c                |   5
-rw-r--r--  block/kyber-iosched.c        |   2
-rw-r--r--  block/partitions/sgi.c       |   2
-rw-r--r--  block/partitions/sun.c       |   2
-rw-r--r--  block/t10-pi.c               |   6
39 files changed, 1180 insertions, 800 deletions
diff --git a/block/Makefile b/block/Makefile
index 33748123710b..3a941dc0d27f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -26,7 +26,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \
+ bio-integrity-auto.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
diff --git a/block/badblocks.c b/block/badblocks.c
index db4ec8b9b2a8..ece64e76fe8f 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -528,51 +528,6 @@ out:
}
/*
- * Return 'true' if the range indicated by 'bad' can be backward merged
- * with the bad range (from the bad table) index by 'behind'.
- */
-static bool can_merge_behind(struct badblocks *bb,
- struct badblocks_context *bad, int behind)
-{
- sector_t sectors = bad->len;
- sector_t s = bad->start;
- u64 *p = bb->page;
-
- if ((s < BB_OFFSET(p[behind])) &&
- ((s + sectors) >= BB_OFFSET(p[behind])) &&
- ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
- BB_ACK(p[behind]) == bad->ack)
- return true;
- return false;
-}
-
-/*
- * Do backward merge for range indicated by 'bad' and the bad range
- * (from the bad table) indexed by 'behind'. The return value is merged
- * sectors from bad->len.
- */
-static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
- int behind)
-{
- sector_t sectors = bad->len;
- sector_t s = bad->start;
- u64 *p = bb->page;
- int merged = 0;
-
- WARN_ON(s >= BB_OFFSET(p[behind]));
- WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
-
- if (s < BB_OFFSET(p[behind])) {
- merged = BB_OFFSET(p[behind]) - s;
- p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
-
- WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
- }
-
- return merged;
-}
-
-/*
* Return 'true' if the range indicated by 'bad' can be forward
* merged with the bad range (from the bad table) indexed by 'prev'.
*/
@@ -745,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev,
*extra = 2;
}
- if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+ if ((bb->count + (*extra)) > MAX_BADBLOCKS)
return false;
return true;
@@ -855,40 +810,60 @@ static void badblocks_update_acked(struct badblocks *bb)
bb->unacked_exist = 0;
}
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly backward
+ * overlapped with the bad range (from bad table) indexed by 'behind'.
+ */
+static bool try_adjacent_combine(struct badblocks *bb, int prev)
+{
+ u64 *p = bb->page;
+
+ if (prev >= 0 && (prev + 1) < bb->count &&
+ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+ BB_ACK(p[prev]));
+
+ if ((prev + 2) < bb->count)
+ memmove(p + prev + 1, p + prev + 2,
+ (bb->count - (prev + 2)) * 8);
+ bb->count--;
+ return true;
+ }
+ return false;
+}
+
/* Do exact work to set bad block range into the bad block table */
-static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,
+ int acknowledged)
{
- int retried = 0, space_desired = 0;
- int orig_len, len = 0, added = 0;
+ int len = 0, added = 0;
struct badblocks_context bad;
int prev = -1, hint = -1;
- sector_t orig_start;
unsigned long flags;
- int rv = 0;
u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
- return 1;
+ return false;
if (sectors == 0)
/* Invalid sectors number */
- return 1;
+ return false;
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- rounddown(s, bb->shift);
- roundup(next, bb->shift);
+ rounddown(s, 1 << bb->shift);
+ roundup(next, 1 << bb->shift);
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
- orig_start = s;
- orig_len = sectors;
bad.ack = acknowledged;
p = bb->page;
@@ -897,6 +872,9 @@ re_insert:
bad.len = sectors;
len = 0;
+ if (badblocks_full(bb))
+ goto out;
+
if (badblocks_empty(bb)) {
len = insert_at(bb, 0, &bad);
bb->count++;
@@ -908,32 +886,14 @@ re_insert:
/* start before all badblocks */
if (prev < 0) {
- if (!badblocks_full(bb)) {
- /* insert on the first */
- if (bad.len > (BB_OFFSET(p[0]) - bad.start))
- bad.len = BB_OFFSET(p[0]) - bad.start;
- len = insert_at(bb, 0, &bad);
- bb->count++;
- added++;
- hint = 0;
- goto update_sectors;
- }
-
- /* No sapce, try to merge */
- if (overlap_behind(bb, &bad, 0)) {
- if (can_merge_behind(bb, &bad, 0)) {
- len = behind_merge(bb, &bad, 0);
- added++;
- } else {
- len = BB_OFFSET(p[0]) - s;
- space_desired = 1;
- }
- hint = 0;
- goto update_sectors;
- }
-
- /* no table space and give up */
- goto out;
+ /* insert on the first */
+ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+ bad.len = BB_OFFSET(p[0]) - bad.start;
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ hint = ++prev;
+ goto update_sectors;
}
/* in case p[prev-1] can be merged with p[prev] */
@@ -945,33 +905,6 @@ re_insert:
goto update_sectors;
}
- if (overlap_front(bb, prev, &bad)) {
- if (can_merge_front(bb, prev, &bad)) {
- len = front_merge(bb, prev, &bad);
- added++;
- } else {
- int extra = 0;
-
- if (!can_front_overwrite(bb, prev, &bad, &extra)) {
- len = min_t(sector_t,
- BB_END(p[prev]) - s, sectors);
- hint = prev;
- goto update_sectors;
- }
-
- len = front_overwrite(bb, prev, &bad, extra);
- added++;
- bb->count += extra;
-
- if (can_combine_front(bb, prev, &bad)) {
- front_combine(bb, prev);
- bb->count--;
- }
- }
- hint = prev;
- goto update_sectors;
- }
-
if (can_merge_front(bb, prev, &bad)) {
len = front_merge(bb, prev, &bad);
added++;
@@ -979,21 +912,29 @@ re_insert:
goto update_sectors;
}
- /* if no space in table, still try to merge in the covered range */
- if (badblocks_full(bb)) {
- /* skip the cannot-merge range */
- if (((prev + 1) < bb->count) &&
- overlap_behind(bb, &bad, prev + 1) &&
- ((s + sectors) >= BB_END(p[prev + 1]))) {
- len = BB_END(p[prev + 1]) - s;
- hint = prev + 1;
+ if (overlap_front(bb, prev, &bad)) {
+ int extra = 0;
+
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ if (extra > 0)
+ goto out;
+
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
goto update_sectors;
}
- /* no retry any more */
- len = sectors;
- space_desired = 1;
- hint = -1;
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
+
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
+ }
+
+ hint = prev;
goto update_sectors;
}
@@ -1006,7 +947,7 @@ re_insert:
len = insert_at(bb, prev + 1, &bad);
bb->count++;
added++;
- hint = prev + 1;
+ hint = ++prev;
update_sectors:
s += len;
@@ -1015,35 +956,12 @@ update_sectors:
if (sectors > 0)
goto re_insert;
- WARN_ON(sectors < 0);
-
/*
* Check whether the following already set range can be
* merged. (prev < 0) condition is not handled here,
* because it's already complicated enough.
*/
- if (prev >= 0 &&
- (prev + 1) < bb->count &&
- BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
- (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
- BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
- p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
- BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
- BB_ACK(p[prev]));
-
- if ((prev + 2) < bb->count)
- memmove(p + prev + 1, p + prev + 2,
- (bb->count - (prev + 2)) * 8);
- bb->count--;
- }
-
- if (space_desired && !badblocks_full(bb)) {
- s = orig_start;
- sectors = orig_len;
- space_desired = 0;
- if (retried++ < 3)
- goto re_insert;
- }
+ try_adjacent_combine(bb, prev);
out:
if (added) {
@@ -1057,10 +975,7 @@ out:
write_sequnlock_irqrestore(&bb->lock, flags);
- if (!added)
- rv = 1;
-
- return rv;
+ return sectors == 0;
}
/*
@@ -1131,21 +1046,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev,
}
/* Do the exact work to clear bad block range from the bad block table */
-static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
{
struct badblocks_context bad;
int prev = -1, hint = -1;
int len = 0, cleared = 0;
- int rv = 0;
u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
- return 1;
+ return false;
if (sectors == 0)
/* Invalid sectors number */
- return 1;
+ return false;
if (bb->shift) {
sector_t target;
@@ -1157,8 +1071,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
* isn't than to think a block is not bad when it is.
*/
target = s + sectors;
- roundup(s, bb->shift);
- rounddown(target, bb->shift);
+ roundup(s, 1 << bb->shift);
+ rounddown(target, 1 << bb->shift);
sectors = target - s;
}
@@ -1214,7 +1128,7 @@ re_clear:
if ((BB_OFFSET(p[prev]) < bad.start) &&
(BB_END(p[prev]) > (bad.start + bad.len))) {
/* Splitting */
- if ((bb->count + 1) < MAX_BADBLOCKS) {
+ if ((bb->count + 1) <= MAX_BADBLOCKS) {
len = front_splitting_clear(bb, prev, &bad);
bb->count += 1;
cleared++;
@@ -1255,8 +1169,6 @@ update_sectors:
if (sectors > 0)
goto re_clear;
- WARN_ON(sectors < 0);
-
if (cleared) {
badblocks_update_acked(bb);
set_changed(bb);
@@ -1265,40 +1177,21 @@ update_sectors:
write_sequnlock_irq(&bb->lock);
if (!cleared)
- rv = 1;
+ return false;
- return rv;
+ return true;
}
/* Do the exact work to check bad blocks range from the bad block table */
-static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
- int unacked_badblocks, acked_badblocks;
int prev = -1, hint = -1, set = 0;
struct badblocks_context bad;
- unsigned int seq;
+ int unacked_badblocks = 0;
+ int acked_badblocks = 0;
+ u64 *p = bb->page;
int len, rv;
- u64 *p;
-
- WARN_ON(bb->shift < 0 || sectors == 0);
-
- if (bb->shift > 0) {
- sector_t target;
-
- /* round the start down, and the end up */
- target = s + sectors;
- rounddown(s, bb->shift);
- roundup(target, bb->shift);
- sectors = target - s;
- }
-
-retry:
- seq = read_seqbegin(&bb->lock);
-
- p = bb->page;
- unacked_badblocks = 0;
- acked_badblocks = 0;
re_check:
bad.start = s;
@@ -1349,14 +1242,15 @@ re_check:
len = sectors;
update_sectors:
+ /* This situation should never happen */
+ WARN_ON(sectors < len);
+
s += len;
sectors -= len;
if (sectors > 0)
goto re_check;
- WARN_ON(sectors < 0);
-
if (unacked_badblocks > 0)
rv = -1;
else if (acked_badblocks > 0)
@@ -1364,9 +1258,6 @@ update_sectors:
else
rv = 0;
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
return rv;
}
@@ -1404,10 +1295,30 @@ update_sectors:
* -1: there are bad blocks which have not yet been acknowledged in metadata.
* plus the start/length of the first bad section we overlap.
*/
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
- return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ unsigned int seq;
+ int rv;
+
+ WARN_ON(bb->shift < 0 || sectors == 0);
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ sector_t target = s + sectors;
+
+ rounddown(s, 1 << bb->shift);
+ roundup(target, 1 << bb->shift);
+ sectors = target - s;
+ }
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
@@ -1423,11 +1334,12 @@ EXPORT_SYMBOL_GPL(badblocks_check);
* decide how best to handle it.
*
* Return:
- * 0: success
- * 1: failed to set badblocks (out of space)
+ * true: success
+ * false: failed to set badblocks (out of space). Parital setting will be
+ * treated as failure.
*/
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,
+ int acknowledged)
{
return _badblocks_set(bb, s, sectors, acknowledged);
}
@@ -1444,10 +1356,10 @@ EXPORT_SYMBOL_GPL(badblocks_set);
* drop the remove request.
*
* Return:
- * 0: success
- * 1: failed to clear badblocks
+ * true: success
+ * false: failed to clear badblocks
*/
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
{
return _badblocks_clear(bb, s, sectors);
}
@@ -1479,6 +1391,11 @@ void ack_all_badblocks(struct badblocks *bb)
p[i] = BB_MAKE(start, len, 1);
}
}
+
+ for (i = 0; i < bb->count ; i++)
+ while (try_adjacent_combine(bb, i))
+ ;
+
bb->unacked_exist = 0;
}
write_sequnlock_irq(&bb->lock);
@@ -1564,10 +1481,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
return -EINVAL;
}
- if (badblocks_set(bb, sector, length, !unack))
+ if (!badblocks_set(bb, sector, length, !unack))
return -ENOSPC;
- else
- return len;
+
+ return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
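
The badblocks rework above changes the set/clear/check calling convention (bool returns, sector_t sizes). A minimal caller sketch of the new interface, not taken from this series; example_mark_bad() and example_range_state() are hypothetical names:

#include <linux/badblocks.h>
#include <linux/errno.h>

/* Hypothetical caller: record a bad range; partial insertion now counts as failure. */
static int example_mark_bad(struct badblocks *bb, sector_t s, sector_t nr)
{
	if (!badblocks_set(bb, s, nr, 1))	/* returns bool instead of 0/1 */
		return -ENOSPC;
	return 0;
}

/* Hypothetical caller: both out-parameters of badblocks_check() are sector_t now. */
static int example_range_state(struct badblocks *bb, sector_t s, sector_t nr)
{
	sector_t first_bad, bad_sectors;

	/* 0: clean, 1: acknowledged bad blocks, -1: unacknowledged bad blocks */
	return badblocks_check(bb, s, nr, &first_bad, &bad_sectors);
}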
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
new file mode 100644
index 000000000000..e524c609be50
--- /dev/null
+++ b/block/bio-integrity-auto.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * Automatically generate and verify integrity data on PI capable devices if the
+ * bio submitter didn't provide PI itself. This ensures that kernel verifies
+ * data integrity even if the file system (or other user of the block device) is
+ * not aware of PI.
+ */
+#include <linux/blk-integrity.h>
+#include <linux/workqueue.h>
+#include "blk.h"
+
+struct bio_integrity_data {
+ struct bio *bio;
+ struct bvec_iter saved_bio_iter;
+ struct work_struct work;
+ struct bio_integrity_payload bip;
+ struct bio_vec bvec;
+};
+
+static struct kmem_cache *bid_slab;
+static mempool_t bid_pool;
+static struct workqueue_struct *kintegrityd_wq;
+
+static void bio_integrity_finish(struct bio_integrity_data *bid)
+{
+ bid->bio->bi_integrity = NULL;
+ bid->bio->bi_opf &= ~REQ_INTEGRITY;
+ kfree(bvec_virt(bid->bip.bip_vec));
+ mempool_free(bid, &bid_pool);
+}
+
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_data *bid =
+ container_of(work, struct bio_integrity_data, work);
+ struct bio *bio = bid->bio;
+
+ blk_integrity_verify_iter(bio, &bid->saved_bio_iter);
+ bio_integrity_finish(bid);
+ bio_endio(bio);
+}
+
+/**
+ * __bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ *
+ * Normally I/O completion is done in interrupt context. However, verifying I/O
+ * integrity is a time-consuming task which must be run in process context.
+ *
+ * This function postpones completion accordingly.
+ */
+bool __bio_integrity_endio(struct bio *bio)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct bio_integrity_data *bid =
+ container_of(bip, struct bio_integrity_data, bip);
+
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
+ INIT_WORK(&bid->work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bid->work);
+ return false;
+ }
+
+ bio_integrity_finish(bid);
+ return true;
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Checks if the bio already has an integrity payload attached. If it does, the
+ * payload has been generated by another kernel subsystem, and we just pass it
+ * through.
+ * Otherwise allocates integrity payload and for writes the integrity metadata
+ * will be generated. For reads, the completion handler will verify the
+ * metadata.
+ */
+bool bio_integrity_prep(struct bio *bio)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_data *bid;
+ gfp_t gfp = GFP_NOIO;
+ unsigned int len;
+ void *buf;
+
+ if (!bi)
+ return true;
+
+ if (!bio_sectors(bio))
+ return true;
+
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return true;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ if (bi->flags & BLK_INTEGRITY_NOVERIFY)
+ return true;
+ break;
+ case REQ_OP_WRITE:
+ if (bi->flags & BLK_INTEGRITY_NOGENERATE)
+ return true;
+
+ /*
+ * Zero the memory allocated to not leak uninitialized kernel
+ * memory to disk for non-integrity metadata where nothing else
+ * initializes the memory.
+ */
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+ gfp |= __GFP_ZERO;
+ break;
+ default:
+ return true;
+ }
+
+ if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
+ return true;
+
+ /* Allocate kernel buffer for protection data */
+ len = bio_integrity_bytes(bi, bio_sectors(bio));
+ buf = kmalloc(len, gfp);
+ if (!buf)
+ goto err_end_io;
+ bid = mempool_alloc(&bid_pool, GFP_NOIO);
+ if (!bid)
+ goto err_free_buf;
+ bio_integrity_init(bio, &bid->bip, &bid->bvec, 1);
+
+ bid->bio = bio;
+
+ bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
+ bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
+
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
+ bid->bip.bip_flags |= BIP_IP_CHECKSUM;
+ if (bi->csum_type)
+ bid->bip.bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+
+ if (bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf)) < len)
+ goto err_end_io;
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE)
+ blk_integrity_generate(bio);
+ else
+ bid->saved_bio_iter = bio->bi_iter;
+ return true;
+
+err_free_buf:
+ kfree(buf);
+err_end_io:
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ return false;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+void blk_flush_integrity(void)
+{
+ flush_workqueue(kintegrityd_wq);
+}
+
+static int __init blk_integrity_auto_init(void)
+{
+ bid_slab = kmem_cache_create("bio_integrity_data",
+ sizeof(struct bio_integrity_data), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+ if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab))
+ panic("bio: can't create integrity pool\n");
+
+ /*
+ * kintegrityd won't block much but may burn a lot of CPU cycles.
+ * Make it highpri CPU intensive wq with max concurrency of 1.
+ */
+ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+ return 0;
+}
+subsys_initcall(blk_integrity_auto_init);
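
A minimal sketch of how a submission path consumes bio_integrity_prep() from the new auto-PI file, assuming a hypothetical wrapper example_submit_bio(); the contract itself is unchanged by the move:

#include <linux/blk-integrity.h>

/* Hypothetical submission helper illustrating the bio_integrity_prep() contract. */
static void example_submit_bio(struct bio *bio)
{
	/*
	 * For writes this attaches auto-generated PI; for reads it saves the
	 * iterator so __bio_integrity_endio() can verify it later.  On
	 * allocation failure the bio is already completed with
	 * BLK_STS_RESOURCE, so the caller simply stops.
	 */
	if (!bio_integrity_prep(bio))
		return;

	/* ... continue issuing the bio to the driver ... */
}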
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5d81ad9a3d20..608594a154a5 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -7,20 +7,12 @@
*/
#include <linux/blk-integrity.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
#include "blk.h"
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
-
-void blk_flush_integrity(void)
-{
- flush_workqueue(kintegrityd_wq);
-}
+struct bio_integrity_alloc {
+ struct bio_integrity_payload bip;
+ struct bio_vec bvecs[];
+};
/**
* bio_integrity_free - Free bio integrity payload
@@ -30,21 +22,23 @@ void blk_flush_integrity(void)
*/
void bio_integrity_free(struct bio *bio)
{
- struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_set *bs = bio->bi_pool;
-
- if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
- if (bip->bip_vec)
- bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_max_vcnt);
- mempool_free(bip, &bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
+ kfree(bio_integrity(bio));
bio->bi_integrity = NULL;
bio->bi_opf &= ~REQ_INTEGRITY;
}
+void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip,
+ struct bio_vec *bvecs, unsigned int nr_vecs)
+{
+ memset(bip, 0, sizeof(*bip));
+ bip->bip_max_vcnt = nr_vecs;
+ if (nr_vecs)
+ bip->bip_vec = bvecs;
+
+ bio->bi_integrity = bip;
+ bio->bi_opf |= REQ_INTEGRITY;
+}
+
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
@@ -59,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
gfp_t gfp_mask,
unsigned int nr_vecs)
{
- struct bio_integrity_payload *bip;
- struct bio_set *bs = bio->bi_pool;
- unsigned inline_vecs;
+ struct bio_integrity_alloc *bia;
if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
return ERR_PTR(-EOPNOTSUPP);
- if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
- bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
- inline_vecs = nr_vecs;
- } else {
- bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIO_INLINE_VECS;
- }
-
- if (unlikely(!bip))
+ bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask);
+ if (unlikely(!bia))
return ERR_PTR(-ENOMEM);
-
- memset(bip, 0, sizeof(*bip));
-
- /* always report as many vecs as asked explicitly, not inline vecs */
- bip->bip_max_vcnt = nr_vecs;
- if (nr_vecs > inline_vecs) {
- bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
- &bip->bip_max_vcnt, gfp_mask);
- if (!bip->bip_vec)
- goto err;
- } else if (nr_vecs) {
- bip->bip_vec = bip->bip_inline_vecs;
- }
-
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
- bio->bi_opf |= REQ_INTEGRITY;
-
- return bip;
-err:
- if (bs && mempool_initialized(&bs->bio_integrity_pool))
- mempool_free(bip, &bs->bio_integrity_pool);
- else
- kfree(bip);
- return ERR_PTR(-ENOMEM);
+ bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs);
+ return &bia->bip;
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -414,149 +376,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
}
/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Checks if the bio already has an integrity payload attached.
- * If it does, the payload has been generated by another kernel subsystem,
- * and we just pass it through. Otherwise allocates integrity payload.
- * The bio must have data direction, target device and start sector set priot
- * to calling. In the WRITE case, integrity metadata will be generated using
- * the block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-bool bio_integrity_prep(struct bio *bio)
-{
- struct bio_integrity_payload *bip;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- unsigned int len;
- void *buf;
- gfp_t gfp = GFP_NOIO;
-
- if (!bi)
- return true;
-
- if (!bio_sectors(bio))
- return true;
-
- /* Already protected? */
- if (bio_integrity(bio))
- return true;
-
- switch (bio_op(bio)) {
- case REQ_OP_READ:
- if (bi->flags & BLK_INTEGRITY_NOVERIFY)
- return true;
- break;
- case REQ_OP_WRITE:
- if (bi->flags & BLK_INTEGRITY_NOGENERATE)
- return true;
-
- /*
- * Zero the memory allocated to not leak uninitialized kernel
- * memory to disk for non-integrity metadata where nothing else
- * initializes the memory.
- */
- if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
- gfp |= __GFP_ZERO;
- break;
- default:
- return true;
- }
-
- /* Allocate kernel buffer for protection data */
- len = bio_integrity_bytes(bi, bio_sectors(bio));
- buf = kmalloc(len, gfp);
- if (unlikely(buf == NULL)) {
- goto err_end_io;
- }
-
- bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
- if (IS_ERR(bip)) {
- kfree(buf);
- goto err_end_io;
- }
-
- bip->bip_flags |= BIP_BLOCK_INTEGRITY;
- bip_set_seed(bip, bio->bi_iter.bi_sector);
-
- if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
- bip->bip_flags |= BIP_IP_CHECKSUM;
-
- /* describe what tags to check in payload */
- if (bi->csum_type)
- bip->bip_flags |= BIP_CHECK_GUARD;
- if (bi->flags & BLK_INTEGRITY_REF_TAG)
- bip->bip_flags |= BIP_CHECK_REFTAG;
- if (bio_integrity_add_page(bio, virt_to_page(buf), len,
- offset_in_page(buf)) < len) {
- printk(KERN_ERR "could not attach integrity payload\n");
- goto err_end_io;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- blk_integrity_generate(bio);
- else
- bip->bio_iter = bio->bi_iter;
- return true;
-
-err_end_io:
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return false;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
-
- blk_integrity_verify(bio);
-
- kfree(bvec_virt(bip->bip_vec));
- bio_integrity_free(bio);
- bio_endio(bio);
-}
-
-/**
- * __bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-bool __bio_integrity_endio(struct bio *bio)
-{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- struct bio_integrity_payload *bip = bio_integrity(bio);
-
- if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
- return false;
- }
-
- kfree(bvec_virt(bip->bip_vec));
- bio_integrity_free(bio);
- return true;
-}
-
-/**
* bio_integrity_advance - Advance integrity vector
* @bio: bio whose integrity vector to update
* @bytes_done: number of data bytes that have been completed
@@ -617,44 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- if (mempool_initialized(&bs->bio_integrity_pool))
- return 0;
-
- if (mempool_init_slab_pool(&bs->bio_integrity_pool,
- pool_size, bip_slab))
- return -1;
-
- if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
- mempool_exit(&bs->bio_integrity_pool);
- return -1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- mempool_exit(&bs->bio_integrity_pool);
- mempool_exit(&bs->bvec_integrity_pool);
-}
-
-void __init bio_integrity_init(void)
-{
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- bip_slab = kmem_cache_create("bio_integrity_payload",
- sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIO_INLINE_VECS,
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-}
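
A sketch of the caller-owned-payload pattern the new bio_integrity_init() helper enables (the same shape bio-integrity-auto.c uses above); struct example_pi_ctx and example_attach_pi() are hypothetical, and the helper's declaration is assumed to be reachable from the bio integrity headers:

#include <linux/bio-integrity.h>

/* Hypothetical per-I/O context that embeds the payload instead of allocating it. */
struct example_pi_ctx {
	struct bio_integrity_payload bip;
	struct bio_vec bvec;
};

static void example_attach_pi(struct bio *bio, struct example_pi_ctx *ctx)
{
	/* Attaches ctx->bip to the bio and sets REQ_INTEGRITY. */
	bio_integrity_init(bio, &ctx->bip, &ctx->bvec, 1);
}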
diff --git a/block/bio.c b/block/bio.c
index 6ac5983ba51e..4e6c85a33d74 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1026,9 +1026,10 @@ EXPORT_SYMBOL(bio_add_page);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
+ unsigned long nr = off / PAGE_SIZE;
+
WARN_ON_ONCE(len > UINT_MAX);
- WARN_ON_ONCE(off > UINT_MAX);
- __bio_add_page(bio, &folio->page, len, off);
+ __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
@@ -1049,9 +1050,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
- if (len > UINT_MAX || off > UINT_MAX)
+ unsigned long nr = off / PAGE_SIZE;
+
+ if (len > UINT_MAX)
return false;
- return bio_add_page(bio, &folio->page, len, off) > 0;
+ return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
}
EXPORT_SYMBOL(bio_add_folio);
@@ -1657,7 +1660,6 @@ void bioset_exit(struct bio_set *bs)
mempool_exit(&bs->bio_pool);
mempool_exit(&bs->bvec_pool);
- bioset_integrity_free(bs);
if (bs->bio_slab)
bio_put_slab(bs);
bs->bio_slab = NULL;
@@ -1737,8 +1739,6 @@ static int __init init_bio(void)
BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
- bio_integrity_init();
-
for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
struct biovec_slab *bvs = bvec_slabs + i;
@@ -1754,9 +1754,6 @@ static int __init init_bio(void)
BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
- if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
- panic("bio: can't create integrity pool\n");
-
return 0;
}
subsys_initcall(init_bio);
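
With the folio_page() change above, the byte offset handed to bio_add_folio() may now point past the first page of a large folio. A minimal sketch, with a hypothetical helper name:

#include <linux/bio.h>

/* Hypothetical: add 512 bytes that start one page into a large folio. */
static bool example_add_tail(struct bio *bio, struct folio *folio)
{
	return bio_add_folio(bio, folio, 512, PAGE_SIZE + 512);
}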
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1994a11ff903..5905f277057b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -816,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
+/*
+ * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
+ * acquires q->elevator_lock, and ensures the correct locking order
+ * between q->elevator_lock and q->rq_qos_mutex.
+ *
+ * This function returns negative error on failure. On success it returns
+ * memflags which must be saved and later passed to blkg_conf_exit_frozen
+ * for restoring the memalloc scope.
+ */
+unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
+{
+ int ret;
+ unsigned long memflags;
+
+ if (ctx->bdev)
+ return -EINVAL;
+
+ ret = blkg_conf_open_bdev(ctx);
+ if (ret < 0)
+ return ret;
+ /*
+ * At this point, we haven’t started protecting anything related to QoS,
+ * so we release q->rq_qos_mutex here, which was first acquired in blkg_
+ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
+ * the queue and acquiring q->elevator_lock to maintain the correct
+ * locking order.
+ */
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
+ mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
+ mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ return memflags;
+}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
@@ -972,6 +1007,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
+/*
+ * Similar to blkg_conf_exit, but also unfreezes the queue and releases
+ * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
+ * is used to open the bdev.
+ */
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
+{
+ if (ctx->bdev) {
+ struct request_queue *q = ctx->bdev->bd_queue;
+
+ blkg_conf_exit(ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ }
+}
+
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
int i;
@@ -1728,27 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
+ /*
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
+ * without pd_alloc_fn/pd_free_fn can't be activated.
+ */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ return -EINVAL;
+
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
/* find an empty slot */
- ret = -ENOSPC;
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (!blkcg_policy[i])
break;
if (i >= BLKCG_MAX_POLS) {
pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
+ ret = -ENOSPC;
goto err_unlock;
}
- /*
- * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
- * without pd_alloc_fn/pd_free_fn can't be activated.
- */
- if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
- (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
- goto err_unlock;
-
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1759,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg_policy_data *cpd;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd)
+ if (!cpd) {
+ ret = -ENOMEM;
goto err_free_cpds;
+ }
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
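
The expected pairing of the two new frozen helpers, the same shape ioc_qos_write() adopts later in this diff; example_policy_write() is a hypothetical policy handler and includes the block-internal blk-cgroup.h the way in-tree code would:

#include <linux/err.h>
#include "blk-cgroup.h"

/* Hypothetical cgroup-policy write path using the frozen open/exit pair. */
static ssize_t example_policy_write(struct blkg_conf_ctx *ctx)
{
	unsigned long memflags;

	memflags = blkg_conf_open_bdev_frozen(ctx);	/* freezes queue, takes locks */
	if (IS_ERR_VALUE(memflags))
		return memflags;

	/* ... update policy state under elevator_lock and rq_qos_mutex ... */

	blkg_conf_exit_frozen(ctx, memflags);		/* drops locks, unfreezes */
	return 0;
}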
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2c4663bd993a..81868ad86330 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -219,9 +219,11 @@ struct blkg_conf_ctx {
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
+unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-core.c b/block/blk-core.c
index d6c4fa3943b5..4623de79effa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
+ mutex_init(&q->elevator_lock);
mutex_init(&q->sysfs_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
@@ -455,6 +456,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
&q->q_lock_cls_key, 0);
+ /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */
+ fs_reclaim_acquire(GFP_KERNEL);
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+ fs_reclaim_release(GFP_KERNEL);
+
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 29a205482617..f154be0b575a 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split;
* This is the key we set when evicting a keyslot. This *should* be the all 0's
* key, but AES-XTS rejects that key, so we use some random bytes instead.
*/
-static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
+static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
@@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
- err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes,
key->size);
if (err) {
blk_crypto_fallback_evict_keyslot(slot);
@@ -539,7 +539,7 @@ static int blk_crypto_fallback_init(void)
if (blk_crypto_fallback_inited)
return 0;
- get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+ get_random_bytes(blank_key, sizeof(blank_key));
err = bioset_init(&crypto_bio_split, 64, 0, 0);
if (err)
@@ -561,6 +561,7 @@ static int blk_crypto_fallback_init(void)
blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops;
blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+ blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 93a141979694..ccf6dff6ff6b 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -14,6 +14,7 @@ struct blk_crypto_mode {
const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
+ unsigned int security_strength; /* security strength in bytes */
unsigned int ivsize; /* iv size in bytes */
};
@@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
const struct blk_crypto_config *cfg);
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
@@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
return false;
}
+static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ return -ENOTTY;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 7fabc883e39f..94a155912bf1 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
return false;
if (profile->max_dun_bytes_supported < cfg->dun_bytes)
return false;
+ if (!(profile->key_types_supported & cfg->key_type))
+ return false;
return true;
}
@@ -463,6 +465,99 @@ bool blk_crypto_register(struct blk_crypto_profile *profile,
EXPORT_SYMBOL_GPL(blk_crypto_register);
/**
+ * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key
+ * @bdev: a block device that supports hardware-wrapped keys
+ * @eph_key: a hardware-wrapped key in ephemerally-wrapped form
+ * @eph_key_size: size of @eph_key in bytes
+ * @sw_secret: (output) the software secret
+ *
+ * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that
+ * it is used for I/O), ask the hardware to derive the secret which software can
+ * use for cryptographic tasks other than inline encryption. This secret is
+ * guaranteed to be cryptographically isolated from the inline encryption key,
+ * i.e. derived with a different KDF context.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support
+ * hardware-wrapped keys, -EBADMSG if the key isn't a valid
+ * ephemerally-wrapped key, or another -errno code.
+ */
+int blk_crypto_derive_sw_secret(struct block_device *bdev,
+ const u8 *eph_key, size_t eph_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+ int err;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.derive_sw_secret)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size,
+ sw_secret);
+ blk_crypto_hw_exit(profile);
+ return err;
+}
+
+int blk_crypto_import_key(struct blk_crypto_profile *profile,
+ const u8 *raw_key, size_t raw_key_size,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.import_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size,
+ lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+int blk_crypto_generate_key(struct blk_crypto_profile *profile,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.generate_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.generate_key(profile, lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+int blk_crypto_prepare_key(struct blk_crypto_profile *profile,
+ const u8 *lt_key, size_t lt_key_size,
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.prepare_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size,
+ eph_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+/**
* blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
* by child device
* @parent: the crypto profile for the parent device
@@ -485,10 +580,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
child->max_dun_bytes_supported);
for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
parent->modes_supported[i] &= child->modes_supported[i];
+ parent->key_types_supported &= child->key_types_supported;
} else {
parent->max_dun_bytes_supported = 0;
memset(parent->modes_supported, 0,
sizeof(parent->modes_supported));
+ parent->key_types_supported = 0;
}
}
EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
@@ -521,6 +618,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
target->max_dun_bytes_supported)
return false;
+ if (reference->key_types_supported & ~target->key_types_supported)
+ return false;
+
return true;
}
EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
@@ -555,5 +655,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
sizeof(dst->modes_supported));
dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+ dst->key_types_supported = src->key_types_supported;
}
EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
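
For orientation, a sketch of an upper-layer consumer of the new blk_crypto_derive_sw_secret() helper; example_derive_secret() is hypothetical, and it assumes the declaration and BLK_CRYPTO_SW_SECRET_SIZE are visible via <linux/blk-crypto.h>:

#include <linux/blk-crypto.h>
#include <linux/string.h>

/* Hypothetical: turn an ephemerally-wrapped key into the software secret. */
static int example_derive_secret(struct block_device *bdev,
				 const u8 *eph_key, size_t eph_key_size)
{
	u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
	int err;

	err = blk_crypto_derive_sw_secret(bdev, eph_key, eph_key_size,
					  sw_secret);
	if (err)
		return err;	/* -EOPNOTSUPP, -EBADMSG, or another errno */

	/* ... feed sw_secret into a KDF for non-inline-crypto uses ... */
	memzero_explicit(sw_secret, sizeof(sw_secret));
	return 0;
}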
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
index a304434489ba..e832f403f200 100644
--- a/block/blk-crypto-sysfs.c
+++ b/block/blk-crypto-sysfs.c
@@ -31,6 +31,13 @@ static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
return container_of(attr, struct blk_crypto_attr, attr);
}
+static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
{
@@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
return sysfs_emit(page, "%u\n", profile->num_slots);
}
+static ssize_t raw_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
#define BLK_CRYPTO_RO_ATTR(_name) \
static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+BLK_CRYPTO_RO_ATTR(hw_wrapped_keys);
BLK_CRYPTO_RO_ATTR(max_dun_bits);
BLK_CRYPTO_RO_ATTR(num_keyslots);
+BLK_CRYPTO_RO_ATTR(raw_keys);
+
+static umode_t blk_crypto_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ if (a == &hw_wrapped_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return 0;
+ if (a == &raw_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW))
+ return 0;
+
+ return 0444;
+}
static struct attribute *blk_crypto_attrs[] = {
+ &hw_wrapped_keys_attr.attr,
&max_dun_bits_attr.attr,
&num_keyslots_attr.attr,
+ &raw_keys_attr.attr,
NULL,
};
static const struct attribute_group blk_crypto_attr_group = {
.attrs = blk_crypto_attrs,
+ .is_visible = blk_crypto_is_visible,
};
/*
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 4d760b092deb..4b1ad84d1b5a 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = {
.name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
.name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
.name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
},
[BLK_ENCRYPTION_MODE_SM4_XTS] = {
.name = "SM4-XTS",
.cipher_str = "xts(sm4)",
.keysize = 32,
+ .security_strength = 16,
.ivsize = 16,
},
};
@@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void)
/* This is assumed in various places. */
BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
- /* Sanity check that no algorithm exceeds the defined limits. */
+ /*
+ * Validate the crypto mode properties. This ideally would be done with
+ * static assertions, but boot-time checks are the next best thing.
+ */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) {
- BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].keysize >
+ BLK_CRYPTO_MAX_RAW_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].security_strength >
+ blk_crypto_modes[i].keysize);
BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE);
}
@@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
/**
* blk_crypto_init_key() - Prepare a key for use with blk-crypto
* @blk_key: Pointer to the blk_crypto_key to initialize.
- * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
- * @crypto_mode; see blk_crypto_modes[].
+ * @key_bytes: the bytes of the key
+ * @key_size: size of the key in bytes
+ * @key_type: type of the key -- either raw or hardware-wrapped
* @crypto_mode: identifier for the encryption algorithm to use
* @dun_bytes: number of bytes that will be used to specify the DUN when this
* key is used
* @data_unit_size: the data unit size to use for en/decryption
*
* Return: 0 on success, -errno on failure. The caller is responsible for
- * zeroizing both blk_key and raw_key when done with them.
+ * zeroizing both blk_key and key_bytes when done with them.
*/
-int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+int blk_crypto_init_key(struct blk_crypto_key *blk_key,
+ const u8 *key_bytes, size_t key_size,
+ enum blk_crypto_key_type key_type,
enum blk_crypto_mode_num crypto_mode,
unsigned int dun_bytes,
unsigned int data_unit_size)
@@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return -EINVAL;
mode = &blk_crypto_modes[crypto_mode];
- if (mode->keysize == 0)
+ switch (key_type) {
+ case BLK_CRYPTO_KEY_TYPE_RAW:
+ if (key_size != mode->keysize)
+ return -EINVAL;
+ break;
+ case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED:
+ if (key_size < mode->security_strength ||
+ key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE)
+ return -EINVAL;
+ break;
+ default:
return -EINVAL;
+ }
if (dun_bytes == 0 || dun_bytes > mode->ivsize)
return -EINVAL;
@@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
blk_key->crypto_cfg.crypto_mode = crypto_mode;
blk_key->crypto_cfg.dun_bytes = dun_bytes;
blk_key->crypto_cfg.data_unit_size = data_unit_size;
+ blk_key->crypto_cfg.key_type = key_type;
blk_key->data_unit_size_bits = ilog2(data_unit_size);
- blk_key->size = mode->keysize;
- memcpy(blk_key->raw, raw_key, mode->keysize);
+ blk_key->size = key_size;
+ memcpy(blk_key->bytes, key_bytes, key_size);
return 0;
}
@@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev,
bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
- return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- blk_crypto_config_supported_natively(bdev, cfg);
+ if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) &&
+ cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW)
+ return true;
+ return blk_crypto_config_supported_natively(bdev, cfg);
}
/**
@@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev,
* an skcipher, and *should not* be called from the data path, since that might
* cause a deadlock
*
- * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
- * blk-crypto-fallback is either disabled or the needed algorithm
- * is disabled in the crypto API; or another -errno code.
+ * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does
+ * not support wrapped keys; -ENOPKG if the key is a raw key but the
+ * hardware does not support raw keys and blk-crypto-fallback is either
+ * disabled or the needed algorithm is disabled in the crypto API; or
+ * another -errno code if something else went wrong.
*/
int blk_crypto_start_using_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
+ if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) {
+ pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev);
+ return -EOPNOTSUPP;
+ }
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev,
pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
}
EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
+
+static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_import_key_arg arg;
+ u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key))
+ return -EINVAL;
+
+ if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr),
+ arg.raw_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(raw_key, sizeof(raw_key));
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_generate_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ ret = blk_crypto_generate_key(profile, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_prepare_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.lt_key_size > sizeof(lt_key))
+ return -EINVAL;
+
+ if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr),
+ arg.lt_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.eph_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.eph_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key,
+ arg.eph_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ memzero_explicit(eph_key, sizeof(eph_key));
+ return ret;
+}
+
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+
+ switch (cmd) {
+ case BLKCRYPTOIMPORTKEY:
+ return blk_crypto_ioctl_import_key(profile, argp);
+ case BLKCRYPTOGENERATEKEY:
+ return blk_crypto_ioctl_generate_key(profile, argp);
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl_prepare_key(profile, argp);
+ default:
+ return -ENOTTY;
+ }
+}
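
A minimal sketch of a kernel consumer adapting to the extended blk_crypto_init_key() signature (explicit key size and key type); example_setup_raw_key() and its parameter choices are illustrative:

#include <linux/blk-crypto.h>

/* Hypothetical consumer: set up a raw AES-256-XTS key and start using it. */
static int example_setup_raw_key(struct block_device *bdev,
				 struct blk_crypto_key *key, const u8 raw[64])
{
	int err;

	err = blk_crypto_init_key(key, raw, 64, BLK_CRYPTO_KEY_TYPE_RAW,
				  BLK_ENCRYPTION_MODE_AES_256_XTS,
				  16 /* dun_bytes */, 4096 /* data_unit_size */);
	if (err)
		return err;

	/* Wrapped keys on hardware without support now fail with -EOPNOTSUPP here. */
	return blk_crypto_start_using_key(bdev, key);
}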
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a72e2a83d075..43d6152897a4 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,9 +95,9 @@ static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, blk_opf_t flags);
static inline struct blk_flush_queue *
-blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
+blk_get_flush_queue(struct blk_mq_ctx *ctx)
{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+ return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_cur_seq(struct request *rq)
@@ -205,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
struct list_head *running;
struct request *rq, *n;
unsigned long flags = 0;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -341,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(ctx);
if (q->elevator) {
WARN_ON(rq->tag < 0);
@@ -382,7 +382,7 @@ static void blk_rq_init_flush(struct request *rq)
bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx);
bool supports_fua = q->limits.features & BLK_FEAT_FUA;
unsigned int policy = 0;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index ed11438eb4be..5bfd70311359 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2718,8 +2718,7 @@ retry_lock:
* All waiters are on iocg->waitq and the wait states are
* synchronized using waitq.lock.
*/
- init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
- wait.wait.private = current;
+ init_wait_func(&wait.wait, iocg_wake_fn);
wait.bio = bio;
wait.abs_cost = abs_cost;
wait.committed = false; /* will be set true by waker */
@@ -3223,14 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
- unsigned int memflags;
+ unsigned long memflags;
int ret;
blkg_conf_init(&ctx, input);
- ret = blkg_conf_open_bdev(&ctx);
- if (ret)
+ memflags = blkg_conf_open_bdev_frozen(&ctx);
+ if (IS_ERR_VALUE(memflags)) {
+ ret = memflags;
goto err;
+ }
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3247,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
- memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
spin_lock_irq(&ioc->lock);
@@ -3347,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
wbt_enable_default(disk);
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue, memflags);
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
-
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue, memflags);
-
ret = -EINVAL;
err:
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return ret;
}
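
Because the queue freeze now happens inside the blkg_conf helper, memflags above had to become an unsigned long so a negative errno can be returned through the same value. A minimal caller sketch of the pattern, using only what this hunk shows:

/*
 * Hedged sketch of the *_frozen caller pattern: the return value of
 * blkg_conf_open_bdev_frozen() is either a negative errno (detected
 * with IS_ERR_VALUE()) or the memflags cookie that must be handed back
 * when unfreezing.
 */
unsigned long memflags;
int ret;

blkg_conf_init(&ctx, input);
memflags = blkg_conf_open_bdev_frozen(&ctx);	/* opens the bdev and freezes its queue */
if (IS_ERR_VALUE(memflags)) {
	ret = memflags;
	goto err;
}
/* ... queue is frozen here: safe to rewrite the QoS parameters ... */
blkg_conf_exit_frozen(&ctx, memflags);		/* unfreezes and releases ctx */
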
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1d1589c35297..fdd4efb54c6c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -551,8 +551,8 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
* Map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries.
*/
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist, struct scatterlist **last_sg)
+int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
+ struct scatterlist **last_sg)
{
struct req_iterator iter = {
.bio = rq->bio,
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index adf5f0697b6b..3421b5521fe2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -347,9 +347,14 @@ static int hctx_busy_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
struct show_busy_params params = { .m = m, .hctx = hctx };
+ int res;
+ res = mutex_lock_interruptible(&hctx->queue->elevator_lock);
+ if (res)
+ return res;
blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
&params);
+ mutex_unlock(&hctx->queue->elevator_lock);
return 0;
}
@@ -400,15 +405,14 @@ static int hctx_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
blk_mq_debugfs_tags_show(m, hctx->tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
@@ -417,15 +421,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_show(void *data, struct seq_file *m)
@@ -434,15 +437,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
blk_mq_debugfs_tags_show(m, hctx->sched_tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
@@ -451,15 +453,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_active_show(void *data, struct seq_file *m)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 7442ca27c2bf..109611445d40 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -349,7 +349,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
}
ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ hctx = blk_mq_map_queue(bio->bi_opf, ctx);
type = hctx->type;
if (list_empty_careful(&ctx->rq_lists[type]))
goto out_put;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 3feeeccf8a99..24656980f443 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
if (!entry->show)
return -EIO;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
res = entry->show(hctx, page);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return res;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b9f417d980b4..d880c50629d6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -190,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
sbitmap_finish_wait(bt, ws, &wait);
data->ctx = blk_mq_get_ctx(data->q);
- data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
- data->ctx);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40490ac88045..ae8494d88897 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -508,7 +508,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
- data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
if (q->elevator) {
/*
@@ -3314,6 +3314,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->nr_integrity_segments = rq_src->nr_integrity_segments;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
@@ -4094,6 +4095,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
+ mutex_lock(&q->elevator_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -4198,6 +4201,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
+
+ mutex_unlock(&q->elevator_lock);
}
/*
@@ -4467,7 +4472,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
unsigned long i, j;
/* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4500,7 +4505,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
@@ -4933,10 +4938,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
if (!qe)
return false;
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+	/* Accessing q->elevator requires holding ->elevator_lock. */

+ mutex_lock(&q->elevator_lock);
- /* the check has to be done with holding sysfs_lock */
if (!q->elevator) {
kfree(qe);
goto unlock;
@@ -4950,7 +4954,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
list_add(&qe->node, head);
elevator_disable(q);
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return true;
}
@@ -4980,11 +4984,11 @@ static void blk_mq_elv_switch_back(struct list_head *head,
list_del(&qe->node);
kfree(qe);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 44979e92b79f..3011a78cf16a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -100,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
- * @q: request queue
* @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- blk_opf_t opf,
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
struct blk_mq_ctx *ctx)
{
return ctx->hctxs[blk_mq_get_hctx_type(opf)];
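
All of the call-site churn above and below follows from one observation: the hardware context is resolved purely from the software ctx and the operation type, so the request_queue argument carried no information. A before/after sketch of a typical caller:

/* before: the queue argument was unused for the lookup */
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
/* after: the same lookup, one argument fewer */
hctx = blk_mq_map_queue(bio->bi_opf, ctx);
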
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d4d4f4dc0e23..95982bc46ba1 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
struct rq_qos_wait_data {
struct wait_queue_entry wq;
- struct task_struct *task;
struct rq_wait *rqw;
acquire_inflight_cb_t *cb;
void *private_data;
@@ -218,7 +217,20 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
- wake_up_process(data->task);
+ /*
+	 * autoremove_wake_function() removes the wait entry only when it has
+	 * actually changed the task state, but we want the entry removed
+	 * unconditionally. Remove it explicitly and use default_wake_function().
+ */
+ default_wake_function(curr, mode, wake_flags, key);
+ /*
+	 * Note the order of operations: finish_wait() checks whether @curr
+	 * has been removed without grabbing the lock, so this must be the
+	 * last access to @data to avoid a use-after-free. The memory
+	 * barriers implied by list_del_init_careful() also guarantee that
+	 * the waiter sees the latest @data->got_token once
+	 * list_empty_careful() in finish_wait() returns true.
+ */
list_del_init_careful(&curr->entry);
return 1;
}
@@ -244,41 +256,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
cleanup_cb_t *cleanup_cb)
{
struct rq_qos_wait_data data = {
- .wq = {
- .func = rq_qos_wake_function,
- .entry = LIST_HEAD_INIT(data.wq.entry),
- },
- .task = current,
- .rqw = rqw,
- .cb = acquire_inflight_cb,
- .private_data = private_data,
+ .rqw = rqw,
+ .cb = acquire_inflight_cb,
+ .private_data = private_data,
+ .got_token = false,
};
- bool has_sleeper;
+ bool first_waiter;
- has_sleeper = wq_has_sleeper(&rqw->wait);
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
+ /*
+ * If there are no waiters in the waiting queue, try to increase the
+ * inflight counter if we can. Otherwise, prepare for adding ourselves
+ * to the waiting queue.
+ */
+ if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
return;
- has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+ init_wait_func(&data.wq, rq_qos_wake_function);
+ first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
+ /*
+	 * Make sure there is at least one inflight process; otherwise, waiters
+	 * will never be woken up. Since there may have been no inflight
+	 * process before we added ourselves to the waiting queue above, we
+	 * have to try to increase the inflight counter for ourselves. It is
+	 * sufficient that the first waiter to enter the waiting queue
+	 * re-checks the waiting condition before going to sleep, which
+	 * guarantees forward progress.
+ */
+ if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
+ finish_wait(&rqw->wait, &data.wq);
+ /*
+ * We raced with rq_qos_wake_function() getting a token,
+ * which means we now have two. Put our local token
+ * and wake anyone else potentially waiting for one.
+ *
+		 * The memory barrier in list_empty_careful() in finish_wait()
+		 * pairs with list_del_init_careful() in
+		 * rq_qos_wake_function() to make sure we see the latest
+		 * @data->got_token.
+ */
+ if (data.got_token)
+ cleanup_cb(rqw, private_data);
+ return;
+ }
+
+ /* we are now relying on the waker to increase our inflight counter. */
do {
- /* The memory barrier in set_current_state saves us here. */
if (data.got_token)
break;
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
- finish_wait(&rqw->wait, &data.wq);
-
- /*
- * We raced with rq_qos_wake_function() getting a token,
- * which means we now have two. Put our local token
- * and wake anyone else potentially waiting for one.
- */
- if (data.got_token)
- cleanup_cb(rqw, private_data);
- return;
- }
io_schedule();
- has_sleeper = true;
set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
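
The acquire_inflight_cb contract that rq_qos_wait() leans on above is not visible in this hunk. A hedged sketch of a typical callback, loosely modelled on what rq-qos users such as wbt do (rq_wait_inc_below() is assumed to be the existing helper from blk-rq-qos.h):

/*
 * Hedged sketch of an acquire_inflight_cb implementation: take one
 * inflight slot only while still under the limit. rq_qos_wait() calls
 * this before sleeping and again when the first waiter re-checks.
 */
static bool example_inflight_cb(struct rq_wait *rqw, void *private_data)
{
	unsigned int limit = *(unsigned int *)private_data;

	return rq_wait_inc_below(rqw, limit);
}

A matching cleanup_cb would drop the slot again and wake the next exclusive waiter, which is what the "put our local token" branch above relies on.
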
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b9c6f0ec1c49..6b2dbe645d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -21,7 +21,7 @@
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
- q->rq_timeout = timeout;
+ WRITE_ONCE(q->rq_timeout, timeout);
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
@@ -114,9 +114,15 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
pr_warn("invalid PI settings.\n");
return -EINVAL;
}
+ bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY;
return 0;
}
+ if (lim->features & BLK_FEAT_BOUNCE_HIGH) {
+ pr_warn("no bounce buffer support for integrity metadata\n");
+ return -EINVAL;
+ }
+
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
pr_warn("integrity support disabled.\n");
return -EINVAL;
@@ -867,36 +873,28 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY))
return true;
- if (!ti->tuple_size) {
- /* inherit the settings from the first underlying device */
- if (!(ti->flags & BLK_INTEGRITY_STACKED)) {
- ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE |
- (bi->flags & BLK_INTEGRITY_REF_TAG);
- ti->csum_type = bi->csum_type;
- ti->tuple_size = bi->tuple_size;
- ti->pi_offset = bi->pi_offset;
- ti->interval_exp = bi->interval_exp;
- ti->tag_size = bi->tag_size;
- goto done;
- }
- if (!bi->tuple_size)
- goto done;
+ if (ti->flags & BLK_INTEGRITY_STACKED) {
+ if (ti->tuple_size != bi->tuple_size)
+ goto incompatible;
+ if (ti->interval_exp != bi->interval_exp)
+ goto incompatible;
+ if (ti->tag_size != bi->tag_size)
+ goto incompatible;
+ if (ti->csum_type != bi->csum_type)
+ goto incompatible;
+ if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
+ (bi->flags & BLK_INTEGRITY_REF_TAG))
+ goto incompatible;
+ } else {
+ ti->flags = BLK_INTEGRITY_STACKED;
+ ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
+ (bi->flags & BLK_INTEGRITY_REF_TAG);
+ ti->csum_type = bi->csum_type;
+ ti->tuple_size = bi->tuple_size;
+ ti->pi_offset = bi->pi_offset;
+ ti->interval_exp = bi->interval_exp;
+ ti->tag_size = bi->tag_size;
}
-
- if (ti->tuple_size != bi->tuple_size)
- goto incompatible;
- if (ti->interval_exp != bi->interval_exp)
- goto incompatible;
- if (ti->tag_size != bi->tag_size)
- goto incompatible;
- if (ti->csum_type != bi->csum_type)
- goto incompatible;
- if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
- (bi->flags & BLK_INTEGRITY_REF_TAG))
- goto incompatible;
-
-done:
- ti->flags |= BLK_INTEGRITY_STACKED;
return true;
incompatible:
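
The restructured queue_limits_stack_integrity() now keys everything off BLK_INTEGRITY_STACKED: the first lower device seeds the stacked profile, and every later device must match it exactly. A hedged caller sketch (the signature is assumed to take the top-level limits and one lower device's limits):

/*
 * Hedged sketch: a stacking driver folding the integrity profiles of
 * all its lower devices into its own limits.
 */
static bool stack_all_integrity(struct queue_limits *t,
				struct queue_limits *lower[], int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* the first call seeds *t and sets BLK_INTEGRITY_STACKED;
		 * later calls only verify compatibility */
		if (!queue_limits_stack_integrity(t, lower[i]))
			return false;	/* mismatched PI settings */
	}
	return true;
}
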
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6f548a4376aa..a2882751f0d2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -23,10 +23,11 @@
struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct gendisk *disk, char *page);
+ ssize_t (*show_limit)(struct gendisk *disk, char *page);
+
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
int (*store_limit)(struct gendisk *disk, const char *page,
size_t count, struct queue_limits *lim);
- void (*load_module)(struct gendisk *disk, const char *page, size_t count);
};
static ssize_t
@@ -52,7 +53,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
static ssize_t queue_requests_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->queue->nr_requests, page);
+ ssize_t ret;
+
+ mutex_lock(&disk->queue->elevator_lock);
+ ret = queue_var_show(disk->queue->nr_requests, page);
+ mutex_unlock(&disk->queue->elevator_lock);
+ return ret;
}
static ssize_t
@@ -60,27 +66,38 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
- if (!queue_is_mq(disk->queue))
+ if (!queue_is_mq(q))
return -EINVAL;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
err = blk_mq_update_nr_requests(disk->queue, nr);
if (err)
- return err;
-
+ ret = err;
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
static ssize_t queue_ra_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ ssize_t ret;
+
+ mutex_lock(&disk->queue->limits_lock);
+ ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ mutex_unlock(&disk->queue->limits_lock);
+
+ return ret;
}
static ssize_t
@@ -88,11 +105,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
+ /*
+ * ->ra_pages is protected by ->limits_lock because it is usually
+ * calculated from the queue limits by queue_limits_commit_update.
+ */
+ mutex_lock(&q->limits_lock);
+ memflags = blk_mq_freeze_queue(q);
disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+ mutex_unlock(&q->limits_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+
return ret;
}
@@ -238,8 +266,9 @@ static ssize_t queue_poll_show(struct gendisk *disk, char *page)
{
if (queue_is_mq(disk->queue))
return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
+
return sysfs_emit(page, "%u\n",
- !!(disk->queue->limits.features & BLK_FEAT_POLL));
+ !!(disk->queue->limits.features & BLK_FEAT_POLL));
}
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
@@ -286,17 +315,21 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
size_t count)
{
unsigned long nm;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
- blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue);
+ memflags = blk_mq_freeze_queue(q);
+ blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+ blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -316,11 +349,19 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
#ifdef CONFIG_SMP
struct request_queue *q = disk->queue;
unsigned long val;
+ unsigned int memflags;
ret = queue_var_store(&val, page, count);
if (ret < 0)
return ret;
+ /*
+	 * Here we update two queue flags using atomic bitops. Updating the
+	 * pair is not atomic, but that is harmless because each flag is read
+	 * individually with an atomic test_bit(), so we don't grab any lock
+	 * while updating these flags.
+ */
+ memflags = blk_mq_freeze_queue(q);
if (val == 2) {
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
@@ -331,6 +372,7 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
+ blk_mq_unfreeze_queue(q, memflags);
#endif
return ret;
}
@@ -344,29 +386,43 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
size_t count)
{
- if (!(disk->queue->limits.features & BLK_FEAT_POLL))
- return -EINVAL;
+ unsigned int memflags;
+ ssize_t ret = count;
+ struct request_queue *q = disk->queue;
+
+ memflags = blk_mq_freeze_queue(q);
+ if (!(q->limits.features & BLK_FEAT_POLL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
- return count;
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ return ret;
}
static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
{
- return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
+ return sysfs_emit(page, "%u\n",
+ jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout)));
}
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
size_t count)
{
- unsigned int val;
+ unsigned int val, memflags;
int err;
+ struct request_queue *q = disk->queue;
err = kstrtou32(page, 10, &val);
if (err || val == 0)
return -EINVAL;
- blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val));
+ memflags = blk_mq_freeze_queue(q);
+ blk_queue_rq_timeout(q, msecs_to_jiffies(val));
+ blk_mq_unfreeze_queue(q, memflags);
return count;
}
@@ -412,57 +468,55 @@ static struct queue_sysfs_entry _prefix##_entry = { \
.store = _prefix##_store, \
};
-#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
- .attr = { .name = _name, .mode = 0644 }, \
- .show = _prefix##_show, \
- .store_limit = _prefix##_store, \
+ .attr = { .name = _name, .mode = 0444 }, \
+ .show_limit = _prefix##_show, \
}
-#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \
-static struct queue_sysfs_entry _prefix##_entry = { \
+#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
- .show = _prefix##_show, \
- .load_module = _prefix##_load_module, \
- .store = _prefix##_store, \
+ .show_limit = _prefix##_show, \
+ .store_limit = _prefix##_store, \
}
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
-QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
-QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
-QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
-
-QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
-QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
-QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
-QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
-QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
-
-QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
-QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
-QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
+QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+
+QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
+QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
+QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");
+
+QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
-QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors,
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
"atomic_write_boundary_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
-QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
-QUEUE_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
-QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
-QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
@@ -470,16 +524,16 @@ QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
-QUEUE_RO_ENTRY(queue_fua, "fua");
-QUEUE_RO_ENTRY(queue_dax, "dax");
+QUEUE_LIM_RO_ENTRY(queue_fua, "fua");
+QUEUE_LIM_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
-QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
-QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
+QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
- .attr = {.name = "hw_sector_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
+ .attr = {.name = "hw_sector_size", .mode = 0444 },
+ .show_limit = queue_logical_block_size_show,
};
QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
@@ -503,14 +557,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page)
static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
{
- if (!wbt_rq_qos(disk->queue))
- return -EINVAL;
+ ssize_t ret;
+ struct request_queue *q = disk->queue;
- if (wbt_disabled(disk->queue))
- return sysfs_emit(page, "0\n");
+ mutex_lock(&q->elevator_lock);
+ if (!wbt_rq_qos(q)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (wbt_disabled(q)) {
+ ret = sysfs_emit(page, "0\n");
+ goto out;
+ }
- return sysfs_emit(page, "%llu\n",
- div_u64(wbt_get_min_lat(disk->queue), 1000));
+ ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
+out:
+ mutex_unlock(&q->elevator_lock);
+ return ret;
}
static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
@@ -520,6 +584,7 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
struct rq_qos *rqos;
ssize_t ret;
s64 val;
+ unsigned int memflags;
ret = queue_var_store64(&val, page);
if (ret < 0)
@@ -527,20 +592,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
if (val < -1)
return -EINVAL;
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
+
rqos = wbt_rq_qos(q);
if (!rqos) {
ret = wbt_init(disk);
if (ret)
- return ret;
+ goto out;
}
+ ret = count;
if (val == -1)
val = wbt_default_latency_nsec(q);
else if (val >= 0)
val *= 1000ULL;
if (wbt_get_min_lat(q) == val)
- return count;
+ goto out;
/*
* Ensure that the queue is idled, in case the latency update
@@ -552,8 +621,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
wbt_set_min_lat(q, val);
blk_mq_unquiesce_queue(q);
+out:
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
- return count;
+ return ret;
}
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
@@ -561,7 +633,9 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
- &queue_ra_entry.attr,
+ /*
+	 * Attributes which are protected by q->limits_lock.
+ */
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
@@ -577,44 +651,58 @@ static struct attribute *queue_attrs[] = {
&queue_discard_granularity_entry.attr,
&queue_max_discard_sectors_entry.attr,
&queue_max_hw_discard_sectors_entry.attr,
- &queue_discard_zeroes_data_entry.attr,
&queue_atomic_write_max_sectors_entry.attr,
&queue_atomic_write_boundary_sectors_entry.attr,
&queue_atomic_write_unit_min_entry.attr,
&queue_atomic_write_unit_max_entry.attr,
- &queue_write_same_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
&queue_zoned_entry.attr,
- &queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
- &queue_nomerges_entry.attr,
&queue_iostats_passthrough_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
&queue_add_random_entry.attr,
- &queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
&queue_dax_entry.attr,
- &queue_poll_delay_entry.attr,
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
+ &queue_ra_entry.attr,
+
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_discard_zeroes_data_entry.attr,
+ &queue_write_same_max_entry.attr,
+ &queue_nr_zones_entry.attr,
+ &queue_nomerges_entry.attr,
+ &queue_poll_entry.attr,
+ &queue_poll_delay_entry.attr,
+
NULL,
};
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
- &queue_requests_entry.attr,
+ /*
+ * Attributes which require some form of locking other than
+ * q->sysfs_lock.
+ */
&elv_iosched_entry.attr,
- &queue_rq_affinity_entry.attr,
- &queue_io_timeout_entry.attr,
+ &queue_requests_entry.attr,
#ifdef CONFIG_BLK_WBT
&queue_wb_lat_entry.attr,
#endif
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_rq_affinity_entry.attr,
+ &queue_io_timeout_entry.attr,
+
NULL,
};
@@ -664,14 +752,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
- ssize_t res;
- if (!entry->show)
+ if (!entry->show && !entry->show_limit)
return -EIO;
- mutex_lock(&disk->queue->sysfs_lock);
- res = entry->show(disk, page);
- mutex_unlock(&disk->queue->sysfs_lock);
- return res;
+
+ if (entry->show_limit) {
+ ssize_t res;
+
+ mutex_lock(&disk->queue->limits_lock);
+ res = entry->show_limit(disk, page);
+ mutex_unlock(&disk->queue->limits_lock);
+ return res;
+ }
+
+ return entry->show(disk, page);
}
static ssize_t
@@ -681,21 +775,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
- unsigned int memflags;
- ssize_t res;
if (!entry->store_limit && !entry->store)
return -EIO;
- /*
- * If the attribute needs to load a module, do it before freezing the
- * queue to ensure that the module file can be read when the request
- * queue is the one for the device storing the module file.
- */
- if (entry->load_module)
- entry->load_module(disk, page, length);
-
if (entry->store_limit) {
+ ssize_t res;
+
struct queue_limits lim = queue_limits_start_update(q);
res = entry->store_limit(disk, page, length, &lim);
@@ -710,12 +796,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
return length;
}
- mutex_lock(&q->sysfs_lock);
- memflags = blk_mq_freeze_queue(q);
- res = entry->store(disk, page, length);
- blk_mq_unfreeze_queue(q, memflags);
- mutex_unlock(&q->sysfs_lock);
- return res;
+ return entry->store(disk, page, length);
}
static const struct sysfs_ops queue_sysfs_ops = {
@@ -784,18 +865,22 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_debugfs_remove;
+ ret = blk_crypto_sysfs_register(disk);
+ if (ret)
+ goto out_unregister_ia_ranges;
+
+ mutex_lock(&q->elevator_lock);
if (q->elevator) {
ret = elv_register_queue(q, false);
- if (ret)
- goto out_unregister_ia_ranges;
+ if (ret) {
+ mutex_unlock(&q->elevator_lock);
+ goto out_crypto_sysfs_unregister;
+ }
}
-
- ret = blk_crypto_sysfs_register(disk);
- if (ret)
- goto out_elv_unregister;
+ wbt_enable_default(disk);
+ mutex_unlock(&q->elevator_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
- wbt_enable_default(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
@@ -817,8 +902,8 @@ int blk_register_queue(struct gendisk *disk)
return ret;
-out_elv_unregister:
- elv_unregister_queue(q);
+out_crypto_sysfs_unregister:
+ blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -864,8 +949,11 @@ void blk_unregister_queue(struct gendisk *disk)
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elv_unregister_queue(q);
+ mutex_unlock(&q->elevator_lock);
+
+ mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
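
With the show/show_limit split above, a read-only limit attribute is declared once and automatically serialized by q->limits_lock in queue_attr_show(). A hedged example of wiring a hypothetical attribute this way (queue_example_show and the "example" name are illustrative, not part of this series):

/*
 * Hypothetical read-only limit attribute: the handler just reads a
 * queue_limits field; queue_attr_show() takes q->limits_lock around
 * every ->show_limit callback, so no extra locking is needed here.
 */
static ssize_t queue_example_show(struct gendisk *disk, char *page)
{
	return sysfs_emit(page, "%u\n", disk->queue->limits.max_sectors);
}

QUEUE_LIM_RO_ENTRY(queue_example, "example");
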
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8d149aff9fd0..91dab43c65ab 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -478,8 +478,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
@@ -498,16 +496,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
}
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
- bool clear_carryover)
+ bool clear)
{
- tg->bytes_disp[rw] = 0;
- tg->io_disp[rw] = 0;
+ if (clear) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ }
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- if (clear_carryover) {
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
- }
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -599,29 +595,34 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start.
*/
-
throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = rounddown(jiffies - tg->slice_start[rw],
tg->td->throtl_slice);
- if (!time_elapsed)
+ /* Don't trim slice until at least 2 slices are used */
+ if (time_elapsed < tg->td->throtl_slice * 2)
return;
+ /*
+	 * The bio submission time may be a few jiffies longer than the
+	 * expected wait time, because 'extra_bytes' can't be divided evenly
+	 * in tg_within_bps_limit() and because of timer wakeup delay. In that
+	 * case, adjusting slice_start would discard the extra wait time and
+	 * cause a lower rate than expected. So, in addition to the rounddown
+	 * above, one extra slice is preserved to absorb the deviation.
+ */
+ time_elapsed -= tg->td->throtl_slice;
bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
- time_elapsed) +
- tg->carryover_bytes[rw];
- io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
- tg->carryover_ios[rw];
+ time_elapsed);
+ io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed);
if (bytes_trim <= 0 && io_trim <= 0)
return;
- tg->carryover_bytes[rw] = 0;
if ((long long)tg->bytes_disp[rw] >= bytes_trim)
tg->bytes_disp[rw] -= bytes_trim;
else
tg->bytes_disp[rw] = 0;
- tg->carryover_ios[rw] = 0;
if ((int)tg->io_disp[rw] >= io_trim)
tg->io_disp[rw] -= io_trim;
else
@@ -636,7 +637,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
jiffies);
}
-static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
+static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
+ long long *bytes, int *ios)
{
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
@@ -649,26 +651,28 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
* configuration.
*/
if (bps_limit != U64_MAX)
- tg->carryover_bytes[rw] +=
- calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
+ *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
tg->bytes_disp[rw];
if (iops_limit != UINT_MAX)
- tg->carryover_ios[rw] +=
- calculate_io_allowed(iops_limit, jiffy_elapsed) -
+ *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
tg->io_disp[rw];
+ tg->bytes_disp[rw] -= *bytes;
+ tg->io_disp[rw] -= *ios;
}
static void tg_update_carryover(struct throtl_grp *tg)
{
+ long long bytes[2] = {0};
+ int ios[2] = {0};
+
if (tg->service_queue.nr_queued[READ])
- __tg_update_carryover(tg, READ);
+ __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
if (tg->service_queue.nr_queued[WRITE])
- __tg_update_carryover(tg, WRITE);
+ __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
/* see comments in struct throtl_grp for meaning of these fields. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
- tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
- tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
+ bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
@@ -686,8 +690,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
/* Round up to the next throttle slice, wait time must be nonzero */
jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
- io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
- tg->carryover_ios[rw];
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
return 0;
@@ -720,8 +723,7 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
- tg->carryover_bytes[rw];
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
return 0;
@@ -810,13 +812,10 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED))
tg->bytes_disp[rw] += bio_size;
- tg->last_bytes_disp[rw] += bio_size;
- }
tg->io_disp[rw]++;
- tg->last_io_disp[rw]++;
}
/**
@@ -1614,13 +1613,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
return tg_may_dispatch(tg, bio, NULL);
}
-static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
-{
- if (!bio_flagged(bio, BIO_BPS_THROTTLED))
- tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
- tg->carryover_ios[rw]--;
-}
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1657,10 +1649,12 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* IOs which may cause priority inversions are
* dispatched directly, even if they're over limit.
- * Debts are handled by carryover_bytes/ios while
- * calculating wait time.
+ *
+			 * Charge and dispatch directly; the throttle control
+			 * algorithm is adaptive, and the extra IO bytes will
+			 * be throttled later to pay back the debt.
*/
- tg_dispatch_in_debt(tg, bio, rw);
+ throtl_charge_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 1a36d1278eea..7964cc041e06 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -102,12 +102,9 @@ struct throtl_grp {
unsigned int iops[2];
/* Number of bytes dispatched in current slice */
- uint64_t bytes_disp[2];
+ int64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
- unsigned int io_disp[2];
-
- uint64_t last_bytes_disp[2];
- unsigned int last_io_disp[2];
+ int io_disp[2];
/*
* The following two fields are updated when new configuration is
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6dfc659d22e2..f1754d07f7e0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -136,8 +136,9 @@ enum {
RWB_MIN_WRITE_SAMPLES = 3,
/*
- * If we have this number of consecutive windows with not enough
- * information to scale up or down, scale up.
+ * If we have this number of consecutive windows without enough
+	 * information to scale up or down, slowly return to the center state
+ * (step == 0).
*/
RWB_UNKNOWN_BUMP = 5,
};
@@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
break;
case LAT_UNKNOWN_WRITES:
/*
- * We started a the center step, but don't have a valid
- * read/write sample, but we do have writes going on.
- * Allow step to go negative, to increase write perf.
+ * We don't have a valid read/write sample, but we do have
+ * writes going on. Allow step to go negative, to increase
+ * write performance.
*/
scale_up(rwb);
break;
@@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
__wbt_done(rqos, flags);
}
-/*
- * May sleep, if we have exceeded the writeback limits. Caller can pass
- * in an irq held spinlock, if it holds one when calling this function.
- * If we do sleep, we'll release and re-grab it.
- */
+/* May sleep, if we have exceeded the writeback limits. */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
struct rq_wb *rwb = RQWB(rqos);
diff --git a/block/blk.h b/block/blk.h
index 9cf9a0099416..006e3be433d2 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -715,7 +715,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
void blk_integrity_generate(struct bio *bio);
-void blk_integrity_verify(struct bio *bio);
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
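
The rename to blk_integrity_verify_iter() reflects a change of contract: the data iter needed for verification is no longer pulled out of the bio_integrity_payload; the submitter snapshots it and the completion path passes that snapshot back in (see the t10-pi.c hunk at the end of this diff). A minimal sketch of the contract:

/*
 * Hedged sketch: snapshot the data iter before the bio is driven and
 * verify against that snapshot at completion time, since bio->bi_iter
 * has been consumed by then.
 */
struct bvec_iter saved_iter = bio->bi_iter;	/* taken at submission */

/* ... the bio completes; bio->bi_iter no longer covers the data ... */

blk_integrity_verify_iter(bio, &saved_iter);	/* verify the original range */
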
diff --git a/block/bounce.c b/block/bounce.c
index 0d898cd5ec49..09a9616cf209 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -41,8 +41,6 @@ static void init_bounce_bioset(void)
ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
BUG_ON(ret);
- if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
- BUG_ON(1);
ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
BUG_ON(ret);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 93523d8f8195..9ceb5d0832f5 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
- buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+ buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list);
buf->payload_len = blk_rq_bytes(req);
return 0;
}
diff --git a/block/elevator.c b/block/elevator.c
index cd2ce4921601..b4d08026b02c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
@@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
unsigned int memflags;
int ret;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -655,7 +655,7 @@ void elevator_disable(struct request_queue *q)
{
unsigned int memflags;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -700,34 +700,44 @@ static int elevator_change(struct request_queue *q, const char *elevator_name)
return ret;
}
-void elv_iosched_load_module(struct gendisk *disk, const char *buf,
- size_t count)
+static void elv_iosched_load_module(char *elevator_name)
{
- char elevator_name[ELV_NAME_MAX];
struct elevator_type *found;
- const char *name;
-
- strscpy(elevator_name, buf, sizeof(elevator_name));
- name = strstrip(elevator_name);
spin_lock(&elv_list_lock);
- found = __elevator_find(name);
+ found = __elevator_find(elevator_name);
spin_unlock(&elv_list_lock);
if (!found)
- request_module("%s-iosched", name);
+ request_module("%s-iosched", elevator_name);
}
ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
+ char *name;
int ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
+ /*
+	 * Load the requested scheduler module, if needed, before freezing the
+	 * queue: the module file may live on the very device whose queue is
+	 * being frozen, and it could no longer be read after the freeze.
+ */
strscpy(elevator_name, buf, sizeof(elevator_name));
- ret = elevator_change(disk->queue, strstrip(elevator_name));
+ name = strstrip(elevator_name);
+
+ elv_iosched_load_module(name);
+
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
+ ret = elevator_change(q, name);
if (!ret)
- return count;
+ ret = count;
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -738,6 +748,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
struct elevator_type *cur = NULL, *e;
int len = 0;
+ mutex_lock(&q->elevator_lock);
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
@@ -755,6 +766,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
spin_unlock(&elv_list_lock);
len += sprintf(name+len, "\n");
+ mutex_unlock(&q->elevator_lock);
+
return len;
}
diff --git a/block/elevator.h b/block/elevator.h
index e526662c5dbb..e4e44dfac503 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -148,8 +148,6 @@ extern void elv_unregister(struct elevator_type *);
* io scheduler sysfs switching
*/
ssize_t elv_iosched_show(struct gendisk *disk, char *page);
-void elv_iosched_load_module(struct gendisk *disk, const char *page,
- size_t count);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
diff --git a/block/genhd.c b/block/genhd.c
index e9375e20d866..c2bd86cd09de 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -565,8 +565,11 @@ out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
- if (disk->queue->elevator)
+ if (disk->queue->elevator) {
+ mutex_lock(&disk->queue->elevator_lock);
elevator_exit(disk->queue);
+ mutex_unlock(&disk->queue->elevator_lock);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
@@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk)
blk_mq_quiesce_queue(q);
if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
rq_qos_exit(q);
blk_mq_unquiesce_queue(q);
diff --git a/block/ioctl.c b/block/ioctl.c
index 6554b728bae6..faa40f383e27 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
+#include "blk-crypto-internal.h"
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
@@ -620,6 +621,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKCRYPTOIMPORTKEY:
+ case BLKCRYPTOGENERATEKEY:
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, mode, argp);
case IOC_PR_RESERVE:
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index dc31f2dfa414..0f0f8452609a 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx);
struct kyber_hctx_data *khd = hctx->sched_data;
struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
index 9cc6b8c1eea4..b5ecddd5181a 100644
--- a/block/partitions/sgi.c
+++ b/block/partitions/sgi.c
@@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state)
p = &label->partitions[0];
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
- /*printk("Dev %s SGI disklabel: bad magic %08x\n",
- state->disk->disk_name, be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
index ddf9e6def4b2..2419af76120f 100644
--- a/block/partitions/sun.c
+++ b/block/partitions/sun.c
@@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state)
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
-/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- state->disk->disk_name, be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2577114ff20c..851db518ee5e 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio)
}
}
-void blk_integrity_verify(struct bio *bio)
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio)
*/
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
- iter.seed = bip->bio_iter.bi_sector;
+ iter.seed = saved_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
- __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) {
+ __bio_for_each_segment(bv, bio, bviter, *saved_iter) {
void *kaddr = bvec_kmap_local(&bv);
blk_status_t ret = BLK_STS_OK;