summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 10:16:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 10:16:56 -0700
commite1b1d03ceec343362524318c076b110066ffe305 (patch)
treeca71105f13d893118eab4feb826d20cca066c53d /drivers
parent5832d26433f2bd0d28f8b12526e3c2fdb203507f (diff)
parent130e6de62107116eba124647116276266be0f84c (diff)
Merge tag 'for-6.18/block-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe: - NVMe pull request via Keith: - FC target fixes (Daniel) - Authentication fixes and updates (Martin, Chris) - Admin controller handling (Kamaljit) - Target lockdep assertions (Max) - Keep-alive updates for discovery (Alastair) - Suspend quirk (Georg) - MD pull request via Yu: - Add support for a lockless bitmap. A key feature of the new bitmap is that the IO fastpath is lockless. If a user issues lots of write IO to the same bitmap bit in a short time, only the first write has additional overhead to update the bitmap bit, with no additional overhead for the following writes. Because only written data is resynced or recovered, there is no need to do a full disk resync/recovery when creating a new array or replacing a disk with a new one. - Switch ->getgeo() and ->bios_param() to using struct gendisk rather than struct block_device. - Rust block changes via Andreas. This series adds configuration via configfs and remote completion to the rnull driver. The series also includes a set of changes to the rust block device driver API: a few cleanup patches, and a few features supporting the rnull changes. The series removes the raw buffer formatting logic from `kernel::block` and improves the logic available in `kernel::string` to support the same use as the removed logic. - floppy arch cleanups - Reduce the number of dereferences needed for ublk commands - Restrict supported sockets for nbd. Mostly done to eliminate a class of issues perpetually reported by syzbot, by using nonsensical socket setups. - A few s390 dasd block fixes - Fix a few issues around atomic writes - Improve DMA iteration for integrity requests - Improve how iovecs are treated with regard to O_DIRECT alignment constraints. We used to require each segment to adhere to the constraints, now only the request as a whole needs to. 
- Clean up and improve p2p support, enabling use of p2p for metadata payloads - Improve locking of request lookup, using SRCU where appropriate - Use page references properly for brd, avoiding very long RCU sections - Fix ordering of recursively submitted IOs - Clean up and improve updating nr_requests for a live device - Various fixes and cleanups * tag 'for-6.18/block-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (164 commits) s390/dasd: enforce dma_alignment to ensure proper buffer validation s390/dasd: Return BLK_STS_INVAL for EINVAL from do_dasd_request ublk: remove redundant zone op check in ublk_setup_iod() nvme: Use non zero KATO for persistent discovery connections nvmet: add safety check for subsys lock nvme-core: use nvme_is_io_ctrl() for I/O controller check nvme-core: do ioccsz/iorcsz validation only for I/O controllers nvme-core: add method to check for an I/O controller blk-cgroup: fix possible deadlock while configuring policy blk-mq: fix null-ptr-deref in blk_mq_free_tags() from error path blk-mq: Fix more tag iteration function documentation selftests: ublk: fix behavior when fio is not installed ublk: don't access ublk_queue in ublk_unmap_io() ublk: pass ublk_io to __ublk_complete_rq() ublk: don't access ublk_queue in ublk_need_complete_req() ublk: don't access ublk_queue in ublk_check_commit_and_fetch() ublk: don't pass ublk_queue to ublk_fetch() ublk: don't access ublk_queue in ublk_config_io_buf() ublk: don't access ublk_queue in ublk_check_fetch_buf() ublk: pass q_id and tag to __ublk_check_and_get_req() ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/ata/libata-scsi.c4
-rw-r--r--drivers/block/Kconfig10
-rw-r--r--drivers/block/Makefile4
-rw-r--r--drivers/block/amiflop.c10
-rw-r--r--drivers/block/aoe/aoeblk.c4
-rw-r--r--drivers/block/aoe/aoemain.c2
-rw-r--r--drivers/block/brd.c75
-rw-r--r--drivers/block/floppy.c59
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c6
-rw-r--r--drivers/block/nbd.c10
-rw-r--r--drivers/block/null_blk/main.c2
-rw-r--r--drivers/block/rbd.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt.c6
-rw-r--r--drivers/block/rnull.rs80
-rw-r--r--drivers/block/rnull/Kconfig13
-rw-r--r--drivers/block/rnull/Makefile3
-rw-r--r--drivers/block/rnull/configfs.rs262
-rw-r--r--drivers/block/rnull/rnull.rs104
-rw-r--r--drivers/block/sunvdc.c7
-rw-r--r--drivers/block/swim.c4
-rw-r--r--drivers/block/ublk_drv.c236
-rw-r--r--drivers/block/virtio_blk.c8
-rw-r--r--drivers/block/xen-blkfront.c4
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/md/Kconfig29
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/io.c3
-rw-r--r--drivers/md/bcache/journal.c2
-rw-r--r--drivers/md/bcache/movinggc.c8
-rw-r--r--drivers/md/bcache/super.c2
-rw-r--r--drivers/md/bcache/writeback.c8
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-raid.c18
-rw-r--r--drivers/md/dm-vdo/vio.c2
-rw-r--r--drivers/md/dm.c4
-rw-r--r--drivers/md/md-bitmap.c89
-rw-r--r--drivers/md/md-bitmap.h107
-rw-r--r--drivers/md/md-cluster.c2
-rw-r--r--drivers/md/md-linear.c14
-rw-r--r--drivers/md/md-llbitmap.c1626
-rw-r--r--drivers/md/md.c382
-rw-r--r--drivers/md/md.h24
-rw-r--r--drivers/md/raid0.c30
-rw-r--r--drivers/md/raid1-10.c2
-rw-r--r--drivers/md/raid1.c119
-rw-r--r--drivers/md/raid1.h4
-rw-r--r--drivers/md/raid10.c107
-rw-r--r--drivers/md/raid10.h2
-rw-r--r--drivers/md/raid5.c74
-rw-r--r--drivers/memstick/core/ms_block.c4
-rw-r--r--drivers/memstick/core/mspro_block.c4
-rw-r--r--drivers/message/fusion/mptscsih.c2
-rw-r--r--drivers/message/fusion/mptscsih.h2
-rw-r--r--drivers/mmc/core/block.c4
-rw-r--r--drivers/mtd/mtd_blkdevs.c4
-rw-r--r--drivers/mtd/ubi/block.c4
-rw-r--r--drivers/nvdimm/btt.c4
-rw-r--r--drivers/nvme/common/auth.c86
-rw-r--r--drivers/nvme/host/auth.c5
-rw-r--r--drivers/nvme/host/core.c23
-rw-r--r--drivers/nvme/host/fc.c10
-rw-r--r--drivers/nvme/host/ioctl.c5
-rw-r--r--drivers/nvme/host/nvme.h2
-rw-r--r--drivers/nvme/host/pci.c184
-rw-r--r--drivers/nvme/host/tcp.c3
-rw-r--r--drivers/nvme/target/core.c15
-rw-r--r--drivers/nvme/target/fc.c35
-rw-r--r--drivers/nvme/target/fcloop.c8
-rw-r--r--drivers/s390/block/dasd.c24
-rw-r--r--drivers/scsi/3w-9xxx.c2
-rw-r--r--drivers/scsi/3w-sas.c2
-rw-r--r--drivers/scsi/3w-xxxx.c2
-rw-r--r--drivers/scsi/BusLogic.c4
-rw-r--r--drivers/scsi/BusLogic.h2
-rw-r--r--drivers/scsi/aacraid/linit.c6
-rw-r--r--drivers/scsi/advansys.c2
-rw-r--r--drivers/scsi/aha152x.c4
-rw-r--r--drivers/scsi/aha1542.c2
-rw-r--r--drivers/scsi/aha1740.c2
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm.c4
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm.c4
-rw-r--r--drivers/scsi/arcmsr/arcmsr_hba.c6
-rw-r--r--drivers/scsi/atp870u.c2
-rw-r--r--drivers/scsi/fdomain.c4
-rw-r--r--drivers/scsi/imm.c2
-rw-r--r--drivers/scsi/initio.c4
-rw-r--r--drivers/scsi/ipr.c8
-rw-r--r--drivers/scsi/ips.c2
-rw-r--r--drivers/scsi/ips.h2
-rw-r--r--drivers/scsi/libsas/sas_scsi_host.c2
-rw-r--r--drivers/scsi/megaraid.c4
-rw-r--r--drivers/scsi/megaraid.h2
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c4
-rw-r--r--drivers/scsi/mpi3mr/mpi3mr_os.c4
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c4
-rw-r--r--drivers/scsi/mvumi.c2
-rw-r--r--drivers/scsi/myrb.c2
-rw-r--r--drivers/scsi/pcmcia/sym53c500_cs.c2
-rw-r--r--drivers/scsi/ppa.c2
-rw-r--r--drivers/scsi/qla1280.c2
-rw-r--r--drivers/scsi/qlogicfas408.c2
-rw-r--r--drivers/scsi/qlogicfas408.h2
-rw-r--r--drivers/scsi/scsicam.c16
-rw-r--r--drivers/scsi/sd.c8
-rw-r--r--drivers/scsi/stex.c2
-rw-r--r--drivers/scsi/storvsc_drv.c2
-rw-r--r--drivers/scsi/wd719x.c2
-rw-r--r--drivers/target/target_core_pscsi.c2
110 files changed, 3270 insertions, 876 deletions
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 2ded5e476d6e..b43a3196e2be 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups);
/**
* ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd.
* @sdev: SCSI device for which BIOS geometry is to be determined
- * @bdev: block device associated with @sdev
+ * @unused: gendisk associated with @sdev
* @capacity: capacity of SCSI device
* @geom: location to which geometry will be output
*
@@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups);
* RETURNS:
* Zero.
*/
-int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
geom[0] = 255;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index df38fb364904..77d694448990 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -17,6 +17,7 @@ menuconfig BLK_DEV
if BLK_DEV
source "drivers/block/null_blk/Kconfig"
+source "drivers/block/rnull/Kconfig"
config BLK_DEV_FD
tristate "Normal floppy disk support"
@@ -311,15 +312,6 @@ config VIRTIO_BLK
This is the virtual block driver for virtio. It can be used with
QEMU based VMMs (like KVM or Xen). Say Y or M.
-config BLK_DEV_RUST_NULL
- tristate "Rust null block driver (Experimental)"
- depends on RUST
- help
- This is the Rust implementation of the null block driver. For now it
- is only a minimal stub.
-
- If unsure, say N.
-
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
depends on INET && BLOCK
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index a695ce74ef22..2d8096eb8cdf 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -9,9 +9,6 @@
# needed for trace events
ccflags-y += -I$(src)
-obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
-rnull_mod-y := rnull.o
-
obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o
@@ -38,6 +35,7 @@ obj-$(CONFIG_ZRAM) += zram/
obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 6357d86eafdc..2932b6653b6f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = MINOR(bdev->bd_dev) & 3;
+ struct amiga_floppy_struct *p = disk->private_data;
- geo->heads = unit[drive].type->heads;
- geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
- geo->cylinders = unit[drive].type->tracks;
+ geo->heads = p->type->heads;
+ geo->sectors = p->dtype->sects * p->type->sect_mult;
+ geo->cylinders = p->type->tracks;
return 0;
}
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 00b74a845328..34ead75e7e02 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx,
}
static int
-aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct aoedev *d = bdev->bd_disk->private_data;
+ struct aoedev *d = disk->private_data;
if ((d->flags & DEVFL_UP) == 0) {
printk(KERN_ERR "aoe: disk not up\n");
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index cdf6e4041bb9..3b21750038ee 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -44,7 +44,7 @@ aoe_init(void)
{
int ret;
- aoe_wq = alloc_workqueue("aoe_wq", 0, 0);
+ aoe_wq = alloc_workqueue("aoe_wq", WQ_PERCPU, 0);
if (!aoe_wq)
return -ENOMEM;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c2eabe14af3..9778259b30d4 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -44,45 +44,74 @@ struct brd_device {
};
/*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page with reference grabbed for a given sector.
*/
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
- return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+ struct page *page;
+ XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+
+ rcu_read_lock();
+repeat:
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page)) {
+ xas_reset(&xas);
+ goto repeat;
+ }
+
+ if (!page)
+ goto out;
+
+ if (!get_page_unless_zero(page)) {
+ xas_reset(&xas);
+ goto repeat;
+ }
+
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(page);
+ xas_reset(&xas);
+ goto repeat;
+ }
+out:
+ rcu_read_unlock();
+
+ return page;
}
/*
* Insert a new page for a given sector, if one does not already exist.
+ * A reference is held on the returned page; the caller must put_page() it.
*/
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
blk_opf_t opf)
- __releases(rcu)
- __acquires(rcu)
{
gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
struct page *page, *ret;
- rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
- if (!page) {
- rcu_read_lock();
+ if (!page)
return ERR_PTR(-ENOMEM);
- }
xa_lock(&brd->brd_pages);
ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
page, gfp);
- rcu_read_lock();
- if (ret) {
+ if (!ret) {
+ brd->brd_nr_pages++;
+ get_page(page);
+ xa_unlock(&brd->brd_pages);
+ return page;
+ }
+
+ if (!xa_is_err(ret)) {
+ get_page(ret);
xa_unlock(&brd->brd_pages);
- __free_page(page);
- if (xa_is_err(ret))
- return ERR_PTR(xa_err(ret));
+ put_page(page);
return ret;
}
- brd->brd_nr_pages++;
+
xa_unlock(&brd->brd_pages);
- return page;
+ put_page(page);
+ return ERR_PTR(xa_err(ret));
}
/*
@@ -95,7 +124,7 @@ static void brd_free_pages(struct brd_device *brd)
pgoff_t idx;
xa_for_each(&brd->brd_pages, idx, page) {
- __free_page(page);
+ put_page(page);
cond_resched();
}
@@ -117,7 +146,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
- rcu_read_lock();
page = brd_lookup_page(brd, sector);
if (!page && op_is_write(opf)) {
page = brd_insert_page(brd, sector, opf);
@@ -135,13 +163,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
memset(kaddr, 0, bv.bv_len);
}
kunmap_local(kaddr);
- rcu_read_unlock();
bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ if (page)
+ put_page(page);
return true;
out_error:
- rcu_read_unlock();
if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
@@ -149,13 +177,6 @@ out_error:
return false;
}
-static void brd_free_one_page(struct rcu_head *head)
-{
- struct page *page = container_of(head, struct page, rcu_head);
-
- __free_page(page);
-}
-
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
@@ -170,7 +191,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- call_rcu(&page->rcu_head, brd_free_one_page);
+ put_page(page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 24be0c2c4075..5336c3c5ca36 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -163,35 +163,35 @@
/* do print messages for unexpected interrupts */
static int print_unex = 1;
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/workqueue.h>
-#include <linux/fdreg.h>
-#include <linux/fd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
+#include <linux/async.h>
#include <linux/bio.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-#include <linux/fcntl.h>
+#include <linux/compat.h>
#include <linux/delay.h>
-#include <linux/mc146818rtc.h> /* CMOS defines */
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/fd.h>
+#include <linux/fdreg.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
#include <linux/major.h>
-#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h> /* CMOS defines */
+#include <linux/mm.h>
#include <linux/mod_devicetable.h>
+#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/io.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/timer.h>
#include <linux/uaccess.h>
-#include <linux/async.h>
-#include <linux/compat.h>
+#include <linux/workqueue.h>
/*
* PS/2 floppies have much slower step rates than regular floppies.
@@ -233,8 +233,6 @@ static unsigned short virtual_dma_port = 0x3f0;
irqreturn_t floppy_interrupt(int irq, void *dev_id);
static int set_dor(int fdc, char mask, char data);
-#define K_64 0x10000 /* 64KB */
-
/* the following is the mask of allowed drives. By default units 2 and
* 3 of both floppy controllers are disabled, because switching on the
* motor of these drives causes system hangs on some PCI computers. drive
@@ -3092,16 +3090,13 @@ static int raw_cmd_copyin(int cmd, void __user *param,
*rcmd = NULL;
loop:
- ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL);
- if (!ptr)
- return -ENOMEM;
+ ptr = memdup_user(param, sizeof(*ptr));
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
*rcmd = ptr;
- ret = copy_from_user(ptr, param, sizeof(*ptr));
ptr->next = NULL;
ptr->buffer_length = 0;
ptr->kernel_data = NULL;
- if (ret)
- return -EFAULT;
param += sizeof(struct floppy_raw_cmd);
if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE)
return -EINVAL;
@@ -3363,9 +3358,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
return 0;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = (long)bdev->bd_disk->private_data;
+ int drive = (long)disk->private_data;
int type = ITYPE(drive_state[drive].fd_device);
struct floppy_struct *g;
int ret;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8fc7761397bd..567192e371a8 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev,
* that each partition is also 4KB aligned. Non-aligned partitions adversely
* affects performance.
*
- * @dev Pointer to the block_device strucutre.
+ * @disk Pointer to the gendisk structure.
* @geo Pointer to a hd_geometry structure.
*
* return value
* 0 Operation completed successfully.
* -ENOTTY An error occurred while reading the drive capacity.
*/
-static int mtip_block_getgeo(struct block_device *dev,
+static int mtip_block_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct driver_data *dd = dev->bd_disk->private_data;
+ struct driver_data *dd = disk->private_data;
sector_t capacity;
if (!dd)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6463d0e8d0ce..1188f32a5e5e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -311,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
@@ -1217,6 +1217,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
if (!sock)
return NULL;
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
if (sock->ops->shutdown == sock_no_shutdown) {
dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
*err = -EINVAL;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 91642c9a3b29..f982027e8c85 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -223,7 +223,7 @@ MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed nu
static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
-MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
static bool g_fua = true;
module_param_named(fua, g_fua, bool, 0444);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index faafd7ff43d6..af0e21149dbc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -7389,7 +7389,7 @@ static int __init rbd_init(void)
* The number of active work items is limited by the number of
* rbd devices * queue depth, so leave @max_active at default.
*/
- rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!rbd_wq) {
rc = -ENOMEM;
goto err_out_slab;
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 15627417f12e..f1409e54010a 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen)
rnbd_clt_put_dev(dev);
}
-static int rnbd_client_getgeo(struct block_device *block_device,
+static int rnbd_client_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
u64 size;
- struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+ struct rnbd_clt_dev *dev = disk->private_data;
struct queue_limits *limit = &dev->queue->limits;
size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
@@ -1809,7 +1809,7 @@ static int __init rnbd_client_init(void)
unregister_blkdev(rnbd_client_major, "rnbd");
return err;
}
- rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0);
+ rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0);
if (!rnbd_clt_wq) {
pr_err("Failed to load module, alloc_workqueue failed.\n");
rnbd_clt_destroy_sysfs_files();
diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
deleted file mode 100644
index 6366da12c5a5..000000000000
--- a/drivers/block/rnull.rs
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-//! This is a Rust implementation of the C null block driver.
-//!
-//! Supported features:
-//!
-//! - blk-mq interface
-//! - direct completion
-//! - block size 4k
-//!
-//! The driver is not configurable.
-
-use kernel::{
- alloc::flags,
- block::mq::{
- self,
- gen_disk::{self, GenDisk},
- Operations, TagSet,
- },
- error::Result,
- new_mutex, pr_info,
- prelude::*,
- sync::{Arc, Mutex},
- types::ARef,
-};
-
-module! {
- type: NullBlkModule,
- name: "rnull_mod",
- authors: ["Andreas Hindborg"],
- description: "Rust implementation of the C null block driver",
- license: "GPL v2",
-}
-
-#[pin_data]
-struct NullBlkModule {
- #[pin]
- _disk: Mutex<GenDisk<NullBlkDevice>>,
-}
-
-impl kernel::InPlaceModule for NullBlkModule {
- fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
- pr_info!("Rust null_blk loaded\n");
-
- // Use a immediately-called closure as a stable `try` block
- let disk = /* try */ (|| {
- let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
-
- gen_disk::GenDiskBuilder::new()
- .capacity_sectors(4096 << 11)
- .logical_block_size(4096)?
- .physical_block_size(4096)?
- .rotational(false)
- .build(fmt!("rnullb{}", 0), tagset)
- })();
-
- try_pin_init!(Self {
- _disk <- new_mutex!(disk?, "nullb:disk"),
- })
- }
-}
-
-struct NullBlkDevice;
-
-#[vtable]
-impl Operations for NullBlkDevice {
- #[inline(always)]
- fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
- mq::Request::end_ok(rq)
- .map_err(|_e| kernel::error::code::EIO)
- // We take no refcounts on the request, so we expect to be able to
- // end the request. The request reference must be unique at this
- // point, and so `end_ok` cannot fail.
- .expect("Fatal error - expected to be able to end request");
-
- Ok(())
- }
-
- fn commit_rqs() {}
-}
diff --git a/drivers/block/rnull/Kconfig b/drivers/block/rnull/Kconfig
new file mode 100644
index 000000000000..7bc5b376c128
--- /dev/null
+++ b/drivers/block/rnull/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Rust null block device driver configuration
+
+config BLK_DEV_RUST_NULL
+ tristate "Rust null block driver (Experimental)"
+ depends on RUST && CONFIGFS_FS
+ help
+ This is the Rust implementation of the null block driver. Like
+ the C version, the driver allows the user to create virtual block
+ devices that can be configured via various configuration options.
+
+ If unsure, say N.
diff --git a/drivers/block/rnull/Makefile b/drivers/block/rnull/Makefile
new file mode 100644
index 000000000000..11cfa5e615dc
--- /dev/null
+++ b/drivers/block/rnull/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
+rnull_mod-y := rnull.o
diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
new file mode 100644
index 000000000000..8498e9bae6fd
--- /dev/null
+++ b/drivers/block/rnull/configfs.rs
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use super::{NullBlkDevice, THIS_MODULE};
+use core::fmt::{Display, Write};
+use kernel::{
+ block::mq::gen_disk::{GenDisk, GenDiskBuilder},
+ c_str,
+ configfs::{self, AttributeOperations},
+ configfs_attrs, new_mutex,
+ page::PAGE_SIZE,
+ prelude::*,
+ str::{kstrtobool_bytes, CString},
+ sync::Mutex,
+};
+use pin_init::PinInit;
+
+pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> {
+ let item_type = configfs_attrs! {
+ container: configfs::Subsystem<Config>,
+ data: Config,
+ child: DeviceConfig,
+ attributes: [
+ features: 0,
+ ],
+ };
+
+ kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {}))
+}
+
+#[pin_data]
+pub(crate) struct Config {}
+
+#[vtable]
+impl AttributeOperations<0> for Config {
+ type Data = Config;
+
+ fn show(_this: &Config, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_str("blocksize,size,rotational,irqmode\n")?;
+ Ok(writer.bytes_written())
+ }
+}
+
+#[vtable]
+impl configfs::GroupOperations for Config {
+ type Child = DeviceConfig;
+
+ fn make_group(
+ &self,
+ name: &CStr,
+ ) -> Result<impl PinInit<configfs::Group<DeviceConfig>, Error>> {
+ let item_type = configfs_attrs! {
+ container: configfs::Group<DeviceConfig>,
+ data: DeviceConfig,
+ attributes: [
+ // Named for compatibility with C null_blk
+ power: 0,
+ blocksize: 1,
+ rotational: 2,
+ size: 3,
+ irqmode: 4,
+ ],
+ };
+
+ Ok(configfs::Group::new(
+ name.try_into()?,
+ item_type,
+ // TODO: cannot coerce new_mutex!() to impl PinInit<_, Error>, so put mutex inside
+ try_pin_init!( DeviceConfig {
+ data <- new_mutex!(DeviceConfigInner {
+ powered: false,
+ block_size: 4096,
+ rotational: false,
+ disk: None,
+ capacity_mib: 4096,
+ irq_mode: IRQMode::None,
+ name: name.try_into()?,
+ }),
+ }),
+ ))
+ }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum IRQMode {
+ None,
+ Soft,
+}
+
+impl TryFrom<u8> for IRQMode {
+ type Error = kernel::error::Error;
+
+ fn try_from(value: u8) -> Result<Self> {
+ match value {
+ 0 => Ok(Self::None),
+ 1 => Ok(Self::Soft),
+ _ => Err(EINVAL),
+ }
+ }
+}
+
+impl Display for IRQMode {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self {
+ Self::None => f.write_str("0")?,
+ Self::Soft => f.write_str("1")?,
+ }
+ Ok(())
+ }
+}
+
+#[pin_data]
+pub(crate) struct DeviceConfig {
+ #[pin]
+ data: Mutex<DeviceConfigInner>,
+}
+
+#[pin_data]
+struct DeviceConfigInner {
+ powered: bool,
+ name: CString,
+ block_size: u32,
+ rotational: bool,
+ capacity_mib: u64,
+ irq_mode: IRQMode,
+ disk: Option<GenDisk<NullBlkDevice>>,
+}
+
+#[vtable]
+impl configfs::AttributeOperations<0> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+
+ if this.data.lock().powered {
+ writer.write_str("1\n")?;
+ } else {
+ writer.write_str("0\n")?;
+ }
+
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ let power_op = kstrtobool_bytes(page)?;
+ let mut guard = this.data.lock();
+
+ if !guard.powered && power_op {
+ guard.disk = Some(NullBlkDevice::new(
+ &guard.name,
+ guard.block_size,
+ guard.rotational,
+ guard.capacity_mib,
+ guard.irq_mode,
+ )?);
+ guard.powered = true;
+ } else if guard.powered && !power_op {
+ drop(guard.disk.take());
+ guard.powered = false;
+ }
+
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<1> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().block_size))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u32>().map_err(|_| EINVAL)?;
+
+ GenDiskBuilder::validate_block_size(value)?;
+ this.data.lock().block_size = value;
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<2> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+
+ if this.data.lock().rotational {
+ writer.write_str("1\n")?;
+ } else {
+ writer.write_str("0\n")?;
+ }
+
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ this.data.lock().rotational = kstrtobool_bytes(page)?;
+
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<3> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().capacity_mib))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u64>().map_err(|_| EINVAL)?;
+
+ this.data.lock().capacity_mib = value;
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<4> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().irq_mode))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u8>().map_err(|_| EINVAL)?;
+
+ this.data.lock().irq_mode = IRQMode::try_from(value)?;
+ Ok(())
+ }
+}
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
new file mode 100644
index 000000000000..1ec694d7f1a6
--- /dev/null
+++ b/drivers/block/rnull/rnull.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This is a Rust implementation of the C null block driver.
+
+mod configfs;
+
+use configfs::IRQMode;
+use kernel::{
+ block::{
+ self,
+ mq::{
+ self,
+ gen_disk::{self, GenDisk},
+ Operations, TagSet,
+ },
+ },
+ error::Result,
+ pr_info,
+ prelude::*,
+ sync::Arc,
+ types::ARef,
+};
+use pin_init::PinInit;
+
+module! {
+ type: NullBlkModule,
+ name: "rnull_mod",
+ authors: ["Andreas Hindborg"],
+ description: "Rust implementation of the C null block driver",
+ license: "GPL v2",
+}
+
+#[pin_data]
+struct NullBlkModule {
+ #[pin]
+ configfs_subsystem: kernel::configfs::Subsystem<configfs::Config>,
+}
+
+impl kernel::InPlaceModule for NullBlkModule {
+ fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
+ pr_info!("Rust null_blk loaded\n");
+
+ try_pin_init!(Self {
+ configfs_subsystem <- configfs::subsystem(),
+ })
+ }
+}
+
+struct NullBlkDevice;
+
+impl NullBlkDevice {
+    /// Builds a `GenDisk` called `name` with a capacity of `capacity_mib` MiB.
+    fn new(
+        name: &CStr,
+        block_size: u32,
+        rotational: bool,
+        capacity_mib: u64,
+        irq_mode: IRQMode,
+    ) -> Result<GenDisk<Self>> {
+        let tagset = Arc::pin_init(TagSet::new(1, 256, 1), GFP_KERNEL)?;
+        let queue_data = Box::new(QueueData { irq_mode }, GFP_KERNEL)?;
+        let sectors = capacity_mib << (20 - block::SECTOR_SHIFT);
+        gen_disk::GenDiskBuilder::new()
+            .capacity_sectors(sectors)
+            .logical_block_size(block_size)?
+            .physical_block_size(block_size)?
+            .rotational(rotational)
+            .build(fmt!("{}", name.to_str()?), tagset, queue_data)
+    }
+}
+
+struct QueueData {
+ irq_mode: IRQMode,
+}
+
+#[vtable]
+impl Operations for NullBlkDevice {
+    type QueueData = KBox<QueueData>;
+
+    /// Ends `rq` directly for `IRQMode::None`; passes it to `mq::Request::complete` for `Soft`.
+    #[inline(always)]
+    fn queue_rq(queue_data: &QueueData, rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
+        match queue_data.irq_mode {
+            IRQMode::Soft => mq::Request::complete(rq),
+            // No refcounts are taken on the request, so the reference must be
+            // unique at this point and `end_ok` cannot fail.
+            IRQMode::None => mq::Request::end_ok(rq)
+                .map_err(|_e| kernel::error::code::EIO)
+                .expect("Fatal error - expected to be able to end request"),
+        }
+        Ok(())
+    }
+
+    fn commit_rqs(_queue_data: &QueueData) {}
+
+    /// Ends `rq` successfully; panics if `end_ok` hands the request back.
+    fn complete(rq: ARef<mq::Request<Self>>) {
+        // No refcounts are taken on the request, so the reference must be
+        // unique at this point and `end_ok` cannot fail.
+        let ended = mq::Request::end_ok(rq);
+        ended
+            .map_err(|_e| kernel::error::code::EIO)
+            .expect("Fatal error - expected to be able to end request");
+    }
+}
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 7af21fe67671..db1fe9772a4d 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
return vio_dring_avail(dr, VDC_TX_RING_SIZE);
}
-static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct gendisk *disk = bdev->bd_disk;
sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
@@ -1189,7 +1188,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
}
if (port->ldc_timeout)
- mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
+ mod_delayed_work(system_percpu_wq, &port->ldc_reset_timer_work,
round_jiffies(jiffies + HZ * port->ldc_timeout));
mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
return;
@@ -1217,7 +1216,7 @@ static int __init vdc_init(void)
{
int err;
- sunvdc_wq = alloc_workqueue("sunvdc", 0, 0);
+ sunvdc_wq = alloc_workqueue("sunvdc", WQ_PERCPU, 0);
if (!sunvdc_wq)
return -ENOMEM;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index eda33c5eb5e2..416015947ae6 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
-static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct floppy_state *fs = disk->private_data;
struct floppy_struct *g;
int ret;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 8fdc26a61104..0c74a41a6753 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -201,7 +201,6 @@ struct ublk_queue {
bool force_abort;
bool canceling;
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
- unsigned short nr_io_ready; /* how many ios setup */
spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
@@ -234,7 +233,7 @@ struct ublk_device {
struct ublk_params params;
struct completion completion;
- unsigned int nr_queues_ready;
+ u32 nr_io_ready;
bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
@@ -252,8 +251,7 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io,
- size_t offset);
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *
@@ -532,7 +530,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
-static inline void __ublk_complete_rq(struct request *req);
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -664,22 +663,44 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
}
+static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
}
+static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
!ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
+{
+ return !ublk_dev_support_user_copy(ub) &&
+ !ublk_dev_support_zero_copy(ub) &&
+ !ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
/*
@@ -697,6 +718,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
+{
+ return ublk_dev_support_user_copy(ub) ||
+ ublk_dev_support_zero_copy(ub) ||
+ ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
struct ublk_io *io)
{
@@ -711,8 +739,11 @@ static inline bool ublk_get_req_ref(struct ublk_io *io)
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
- if (refcount_dec_and_test(&io->ref))
- __ublk_complete_rq(req);
+ if (!refcount_dec_and_test(&io->ref))
+ return;
+
+ /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
+ __ublk_complete_rq(req, io, false);
}
static inline bool ublk_sub_req_ref(struct ublk_io *io)
@@ -728,6 +759,11 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
+static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
+}
+
/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
@@ -764,11 +800,9 @@ static inline int __ublk_queue_cmd_buf_size(int depth)
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
}
-static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
+static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
-
- return __ublk_queue_cmd_buf_size(ubq->q_depth);
+ return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
}
static int ublk_max_cmd_buf_size(void)
@@ -1019,13 +1053,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
return rq_bytes;
}
-static int ublk_unmap_io(const struct ublk_queue *ubq,
+static int ublk_unmap_io(bool need_map,
const struct request *req,
const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (!ublk_need_map_io(ubq))
+ if (!need_map)
return rq_bytes;
if (ublk_need_unmap_req(req)) {
@@ -1072,13 +1106,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
- enum req_op op = req_op(req);
u32 ublk_op;
- if (!ublk_queue_is_zoned(ubq) &&
- (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
- return BLK_STS_IOERR;
-
switch (req_op(req)) {
case REQ_OP_READ:
ublk_op = UBLK_IO_OP_READ;
@@ -1117,10 +1146,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
}
/* todo: handle partial completion */
-static inline void __ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map)
{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
@@ -1144,7 +1172,7 @@ static inline void __ublk_complete_rq(struct request *req)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
- unmapped_bytes = ublk_unmap_io(ubq, req, io);
+ unmapped_bytes = ublk_unmap_io(need_map, req, io);
/*
* Extremely impossible since we got data filled in just before
@@ -1500,9 +1528,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
- /* All old ioucmds have to be completed */
- ubq->nr_io_ready = 0;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1551,7 +1576,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
- ub->nr_queues_ready = 0;
+ ub->nr_io_ready = 0;
ub->unprivileged_daemons = false;
ub->ublksrv_tgid = -1;
}
@@ -1775,23 +1800,23 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
__func__, q_id, current->pid, vma->vm_start,
phys_off, (unsigned long)sz);
- if (sz != ublk_queue_cmd_buf_size(ub, q_id))
+ if (sz != ublk_queue_cmd_buf_size(ub))
return -EINVAL;
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+ if (ublk_nosrv_should_reissue_outstanding(ub))
blk_mq_requeue_request(req, false);
else {
io->res = -EIO;
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
}
}
@@ -1811,7 +1836,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
struct ublk_io *io = &ubq->ios[i];
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
- __ublk_fail_req(ubq, io, io->req);
+ __ublk_fail_req(ub, io, io->req);
}
}
@@ -1916,9 +1941,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+static inline bool ublk_dev_ready(const struct ublk_device *ub)
{
- return ubq->nr_io_ready == ubq->q_depth;
+ u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth;
+
+ return ub->nr_io_ready == total;
}
static void ublk_cancel_queue(struct ublk_queue *ubq)
@@ -2042,16 +2069,14 @@ static void ublk_reset_io_flags(struct ublk_device *ub)
}
/* device can only be started after all IOs are ready */
-static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_mark_io_ready(struct ublk_device *ub)
__must_hold(&ub->mutex)
{
- ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq))
- ub->nr_queues_ready++;
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
ub->unprivileged_daemons = true;
- if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
+ ub->nr_io_ready++;
+ if (ublk_dev_ready(ub)) {
/* now we are ready for handling ublk io request */
ublk_reset_io_flags(ub);
complete_all(&ub->completion);
@@ -2122,11 +2147,11 @@ ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
}
static inline int
-ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
+ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
struct io_uring_cmd *cmd, unsigned long buf_addr,
u16 *buf_idx)
{
- if (ublk_support_auto_buf_reg(ubq))
+ if (ublk_dev_support_auto_buf_reg(ub))
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
io->addr = buf_addr;
@@ -2165,18 +2190,18 @@ static void ublk_io_release(void *priv)
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag,
struct ublk_io *io,
unsigned int index, unsigned int issue_flags)
{
- struct ublk_device *ub = cmd->file->private_data;
struct request *req;
int ret;
- if (!ublk_support_zero_copy(ubq))
+ if (!ublk_dev_support_zero_copy(ub))
return -EINVAL;
- req = __ublk_check_and_get_req(ub, ubq, io, 0);
+ req = __ublk_check_and_get_req(ub, q_id, tag, io, 0);
if (!req)
return -EINVAL;
@@ -2192,7 +2217,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
static int
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, struct ublk_io *io,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag, struct ublk_io *io,
unsigned index, unsigned issue_flags)
{
unsigned new_registered_buffers;
@@ -2205,9 +2231,10 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
*/
new_registered_buffers = io->task_registered_buffers + 1;
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
- return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
+ issue_flags);
- if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
+ if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
@@ -2229,14 +2256,14 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
-static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
+static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
{
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
*/
- if (!buf_addr && !ublk_need_get_data(ubq))
+ if (!buf_addr && !ublk_dev_need_get_data(ub))
return -EINVAL;
} else if (buf_addr) {
/* User copy requires addr to be unset */
@@ -2245,10 +2272,9 @@ static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
return 0;
}
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
- struct ublk_device *ub = ubq->dev;
int ret = 0;
/*
@@ -2257,8 +2283,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
*/
mutex_lock(&ub->mutex);
- /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
- if (ublk_queue_ready(ubq)) {
+ /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
+ if (ublk_dev_ready(ub)) {
ret = -EBUSY;
goto out;
}
@@ -2272,28 +2298,28 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
if (ret)
goto out;
WRITE_ONCE(io->task, get_task_struct(current));
- ublk_mark_io_ready(ub, ubq);
+ ublk_mark_io_ready(ub);
out:
mutex_unlock(&ub->mutex);
return ret;
}
-static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
+static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
struct request *req = io->req;
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
- if (!buf_addr && (!ublk_need_get_data(ubq) ||
+ if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
req_op(req) == REQ_OP_READ))
return -EINVAL;
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
@@ -2307,10 +2333,10 @@ static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
return 0;
}
-static bool ublk_need_complete_req(const struct ublk_queue *ubq,
+static bool ublk_need_complete_req(const struct ublk_device *ub,
struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq))
+ if (ublk_dev_need_req_ref(ub))
return ublk_sub_req_ref(io);
return true;
}
@@ -2333,23 +2359,28 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
return ublk_start_io(ubq, req, io);
}
-static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
- unsigned int issue_flags,
- const struct ublksrv_io_cmd *ub_cmd)
+static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
+ /* May point to userspace-mapped memory */
+ const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
- unsigned tag = ub_cmd->tag;
+ u16 q_id = READ_ONCE(ub_src->q_id);
+ u16 tag = READ_ONCE(ub_src->tag);
+ s32 result = READ_ONCE(ub_src->result);
+ u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
struct request *req;
int ret;
bool compl;
+ WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
+
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
- __func__, cmd->cmd_op, ub_cmd->q_id, tag,
- ub_cmd->result);
+ __func__, cmd->cmd_op, q_id, tag, result);
ret = ublk_check_cmd_op(cmd_op);
if (ret)
@@ -2360,25 +2391,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
- return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
- issue_flags);
+ return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
ret = -EINVAL;
- if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+ if (q_id >= ub->dev_info.nr_hw_queues)
goto out;
- ubq = ublk_get_queue(ub, ub_cmd->q_id);
+ ubq = ublk_get_queue(ub, q_id);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
goto out;
io = &ubq->ios[tag];
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
- ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
+ ret = ublk_check_fetch_buf(ub, addr);
if (ret)
goto out;
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ ret = ublk_fetch(cmd, ub, io, addr);
if (ret)
goto out;
@@ -2392,8 +2422,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so can be handled on any task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
- return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
- issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io,
+ addr, issue_flags);
goto out;
}
@@ -2414,24 +2444,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
+ ret = ublk_check_commit_and_fetch(ub, io, addr);
if (ret)
goto out;
- io->res = ub_cmd->result;
+ io->res = result;
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
- compl = ublk_need_complete_req(ubq, io);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
+ compl = ublk_need_complete_req(ub, io);
/* can't touch 'ublk_io' any more */
if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
+ req->__sector = addr;
if (compl)
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
if (ret)
goto out;
@@ -2443,7 +2473,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* request
*/
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
WARN_ON_ONCE(ret);
if (likely(ublk_get_data(ubq, io, req))) {
__ublk_prep_compl_io_cmd(io, req);
@@ -2463,16 +2493,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset)
{
- unsigned tag = io - ubq->ios;
struct request *req;
/*
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
* which would overwrite it with io->cmd
*/
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
if (!req)
return NULL;
@@ -2494,26 +2523,6 @@ fail_put:
return NULL;
}
-static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- /*
- * Not necessary for async retry, but let's keep it simple and always
- * copy the values to avoid any potential reuse.
- */
- const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
- const struct ublksrv_io_cmd ub_cmd = {
- .q_id = READ_ONCE(ub_src->q_id),
- .tag = READ_ONCE(ub_src->tag),
- .result = READ_ONCE(ub_src->result),
- .addr = READ_ONCE(ub_src->addr)
- };
-
- WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
-
- return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
-}
-
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -2583,17 +2592,14 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
return ERR_PTR(-EINVAL);
ubq = ublk_get_queue(ub, q_id);
- if (!ubq)
- return ERR_PTR(-EINVAL);
-
- if (!ublk_support_user_copy(ubq))
+ if (!ublk_dev_support_user_copy(ub))
return ERR_PTR(-EACCES);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
return ERR_PTR(-EINVAL);
*io = &ubq->ios[tag];
- req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
+ req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off);
if (!req)
return ERR_PTR(-EINVAL);
@@ -2656,7 +2662,7 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub, q_id);
+ int size = ublk_queue_cmd_buf_size(ub);
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
int i;
@@ -2683,7 +2689,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
ubq->q_depth = ub->dev_info.queue_depth;
- size = ublk_queue_cmd_buf_size(ub, q_id);
+ size = ublk_queue_cmd_buf_size(ub);
ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
if (!ptr)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index e649fa67bac1..f061420dfb10 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -829,9 +829,9 @@ out:
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
-static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct virtio_blk *vblk = bd->bd_disk->private_data;
+ struct virtio_blk *vblk = disk->private_data;
int ret = 0;
mutex_lock(&vblk->vdev_mutex);
@@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
/* some standard values, similar to sd */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
}
out:
mutex_unlock(&vblk->vdev_mutex);
@@ -1682,7 +1682,7 @@ static int __init virtio_blk_init(void)
{
int error;
- virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+ virtblk_wq = alloc_workqueue("virtio-blk", WQ_PERCPU, 0);
if (!virtblk_wq)
return -ENOMEM;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5babe575c288..04fc6b552c04 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg)
schedule_work(&rinfo->work);
}
-static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg)
{
/* We don't have real geometry info, but let's at least return
values consistent with the size of the device */
- sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
hg->heads = 0xff;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f31652085adc..42809cf6dd82 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1085,7 +1085,7 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page,
work.entry = entry;
INIT_WORK_ONSTACK(&work.work, zram_sync_read);
- queue_work(system_unbound_wq, &work.work);
+ queue_work(system_dfl_wq, &work.work);
flush_work(&work.work);
destroy_work_on_stack(&work.work);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ddb37f6670de..07c19b2182ca 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -37,6 +37,32 @@ config BLK_DEV_MD
If unsure, say N.
+config MD_BITMAP
+ bool "MD RAID bitmap support"
+ default y
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the write intent bitmap will be
+ enabled. The bitmap can be used to speed up resync after a power
+ failure or after re-adding a disk, by limiting the resync to the
+ sectors recorded as dirty in the bitmap.
+
+ This feature can be added to an existing MD array, or an MD array
+ can be created with a bitmap via mdadm(8).
+
+ If unsure, say Y.
+
+config MD_LLBITMAP
+ bool "MD RAID lockless bitmap support"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the lockless write intent bitmap will
+ be enabled.
+
+ Note, this is an experimental feature.
+
+ If unsure, say N.
+
config MD_AUTODETECT
bool "Autodetect RAID arrays during kernel boot"
depends on BLK_DEV_MD=y
@@ -54,6 +80,7 @@ config MD_AUTODETECT
config MD_BITMAP_FILE
bool "MD bitmap file support (deprecated)"
default y
+ depends on MD_BITMAP
help
If you say Y here, support for write intent bitmaps in files on an
external file system is enabled. This is an alternative to the internal
@@ -174,6 +201,7 @@ config MD_RAID456
config MD_CLUSTER
tristate "Cluster Support for MD"
+ select MD_BITMAP
depends on BLK_DEV_MD
depends on DLM
default n
@@ -393,6 +421,7 @@ config DM_RAID
select MD_RAID1
select MD_RAID10
select MD_RAID456
+ select MD_BITMAP
select BLK_DEV_MD
help
A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 87bdfc9fe14c..5a51b3408b70 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
-md-mod-y += md.o md-bitmap.o
+md-mod-y += md.o
+md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o
+md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
linear-y += md-linear.o
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 7510d1c983a5..f327456fc4e0 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
check = bio_kmalloc(nr_segs, GFP_NOIO);
if (!check)
return;
- bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs,
- REQ_OP_READ);
+ bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ);
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 020712c5203f..2386d08bf4e4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
- meta_bucket_pages(&c->cache->sb), 0);
+ bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0);
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7ff14bd2feb8..d50eb82ccb4f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca)
atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
- bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
+ bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD);
bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
ca->sb.d[ja->discard_idx]);
bio->bi_iter.bi_size = bucket_bytes(ca);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 26a6a535ec32..73918e55bf04 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1492c8552255..6d250e366412 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca)
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
- bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
+ bio_init_inline(&ca->journal.bio, NULL, 8, 0);
/*
* When the cache disk is first registered, ca->sb.njournal_buckets
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 302e75f1fc4b..6ba73dc1a3df 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(struct_size(io, bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ff7595caf440..8f3a23f4b168 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
use_dmio(b, op, sector, n_sectors, offset, ioprio);
return;
}
- bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
+ bio_init_inline(bio, b->c->bdev, 1, op);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_complete;
bio->bi_private = b;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index cf17fd46e255..08925aca838c 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
if (!clone)
return NULL;
- bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf);
+ bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf);
clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector);
clone->bi_private = bio;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f4b904e24328..0a1788fed68c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3955,9 +3955,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
struct mddev *mddev = &rs->md;
- r = mddev->bitmap_ops->load(mddev);
- if (r)
- DMERR("Failed to load bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->load(mddev);
+ if (r)
+ DMERR("Failed to load bitmap");
+ }
}
return r;
@@ -4072,10 +4074,12 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
- r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
- chunksize, false);
- if (r)
- DMERR("Failed to resize bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
+ chunksize);
+ if (r)
+ DMERR("Failed to resize bitmap");
+ }
}
/* Check for any resize/reshape on @rs and adjust/initiate */
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index e7f4153e55e3..8fc22fb14196 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t
return VDO_SUCCESS;
bio->bi_ioprio = 0;
- bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_io_vec = bio_inline_vecs(bio);
bio->bi_max_vecs = vio->block_count + 1;
if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
size, vio_size) != VDO_SUCCESS)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a44e8c2dccee..7bd6fa05b00a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w)
dm_deferred_remove();
}
-static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
+ struct mapped_device *md = disk->private_data;
return dm_get_geometry(md, geo);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 334b71404930..84b7e2af6dba 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -34,15 +34,6 @@
#include "md-bitmap.h"
#include "md-cluster.h"
-#define BITMAP_MAJOR_LO 3
-/* version 4 insists the bitmap is in little-endian order
- * with version 3, it is host-endian which is non-portable
- * Version 5 is currently set only for clustered devices
- */
-#define BITMAP_MAJOR_HI 4
-#define BITMAP_MAJOR_CLUSTERED 5
-#define BITMAP_MAJOR_HOSTENDIAN 3
-
/*
* in-memory bitmap:
*
@@ -224,6 +215,8 @@ struct bitmap {
int cluster_slot;
};
+static struct workqueue_struct *md_bitmap_wq;
+
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, bool init);
@@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap)
return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}
-static bool __bitmap_enabled(struct bitmap *bitmap)
-{
- return bitmap->storage.filemap &&
- !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-static bool bitmap_enabled(struct mddev *mddev)
+static bool bitmap_enabled(void *data, bool flush)
{
- struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap *bitmap = data;
- if (!bitmap)
- return false;
+ if (!flush)
+ return true;
- return __bitmap_enabled(bitmap);
+ /*
+ * If the caller wants to flush bitmap pages to underlying disks, check
+ * if there are cached pages in filemap.
+ */
+ return !test_bit(BITMAP_STALE, &bitmap->flags) &&
+ bitmap->storage.filemap != NULL;
}
/*
@@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
return -EINVAL;
}
- md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
+ md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
+ page, 0);
return 0;
}
@@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap)
int dirty, need_write;
int writing = 0;
- if (!__bitmap_enabled(bitmap))
+ if (!bitmap_enabled(bitmap, true))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
- bool rv;
+ bool rv = false;
- if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
- *blocks = 1024;
- return true; /* always resync if no bitmap */
- }
spin_lock_irq(&bitmap->counts.lock);
-
- rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc) {
/* locked */
@@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
bitmap_counter_t *bmc;
unsigned long flags;
- if (bitmap == NULL) {
- *blocks = 1024;
- return;
- }
spin_lock_irqsave(&bitmap->counts.lock, flags);
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc == NULL)
@@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev)
struct bitmap *bitmap = mddev->bitmap;
int bw;
- if (!bitmap)
- return;
-
atomic_inc(&bitmap->behind_writes);
bw = atomic_read(&bitmap->behind_writes);
if (bw > bitmap->behind_writes_used)
@@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
- if (!bitmap)
- return;
-
if (atomic_dec_and_test(&bitmap->behind_writes))
wake_up(&bitmap->behind_wait);
pr_debug("dec write-behind count %d/%lu\n",
@@ -2593,15 +2570,14 @@ err:
return ret;
}
-static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init)
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- return __bitmap_resize(bitmap, blocks, chunksize, init);
+ return __bitmap_resize(bitmap, blocks, chunksize, false);
}
static ssize_t
@@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = {
&max_backlog_used.attr,
NULL
};
-const struct attribute_group md_bitmap_group = {
+
+static struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
static struct bitmap_operations bitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_BITMAP,
+ .name = "bitmap",
+ },
+
.enabled = bitmap_enabled,
.create = bitmap_create,
.resize = bitmap_resize,
@@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = {
.start_write = bitmap_start_write,
.end_write = bitmap_end_write,
+ .start_discard = bitmap_start_write,
+ .end_discard = bitmap_end_write,
+
.start_sync = bitmap_start_sync,
.end_sync = bitmap_end_sync,
.cond_end_sync = bitmap_cond_end_sync,
@@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = {
.copy_from_slot = bitmap_copy_from_slot,
.set_pages = bitmap_set_pages,
.free = md_bitmap_free,
+
+ .group = &md_bitmap_group,
};
-void mddev_set_bitmap_ops(struct mddev *mddev)
+int md_bitmap_init(void)
+{
+ md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+ 0);
+ if (!md_bitmap_wq)
+ return -ENOMEM;
+
+ return register_md_submodule(&bitmap_ops.head);
+}
+
+void md_bitmap_exit(void)
{
- mddev->bitmap_ops = &bitmap_ops;
+ destroy_workqueue(md_bitmap_wq);
+ unregister_md_submodule(&bitmap_ops.head);
}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 59e9dd45cfde..b42a28fa83a0 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -9,10 +9,26 @@
#define BITMAP_MAGIC 0x6d746962
+/*
+ * version 3 is host-endian order, this is deprecated and not used for new
+ * array
+ */
+#define BITMAP_MAJOR_LO 3
+#define BITMAP_MAJOR_HOSTENDIAN 3
+/* version 4 is little-endian order, the default value */
+#define BITMAP_MAJOR_HI 4
+/* version 5 is only used for cluster */
+#define BITMAP_MAJOR_CLUSTERED 5
+/* version 6 is only used for lockless bitmap */
+#define BITMAP_MAJOR_LOCKLESS 6
+
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
- BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
+ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+ BITMAP_FIRST_USE = 3, /* llbitmap is just created */
+ BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */
+ BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */
BITMAP_HOSTENDIAN =15,
};
@@ -61,11 +77,15 @@ struct md_bitmap_stats {
struct file *file;
};
+typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors);
+
struct bitmap_operations {
- bool (*enabled)(struct mddev *mddev);
+ struct md_submodule_head head;
+
+ bool (*enabled)(void *data, bool flush);
int (*create)(struct mddev *mddev);
- int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init);
+ int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize);
int (*load)(struct mddev *mddev);
void (*destroy)(struct mddev *mddev);
@@ -80,10 +100,13 @@ struct bitmap_operations {
void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
- void (*start_write)(struct mddev *mddev, sector_t offset,
- unsigned long sectors);
- void (*end_write)(struct mddev *mddev, sector_t offset,
- unsigned long sectors);
+ md_bitmap_fn *start_write;
+ md_bitmap_fn *end_write;
+ md_bitmap_fn *start_discard;
+ md_bitmap_fn *end_discard;
+
+ sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset);
+ bool (*blocks_synced)(struct mddev *mddev, sector_t offset);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
@@ -101,9 +124,75 @@ struct bitmap_operations {
sector_t *hi, bool clear_bits);
void (*set_pages)(void *data, unsigned long pages);
void (*free)(void *data);
+
+ struct attribute_group *group;
};
/* the bitmap API */
-void mddev_set_bitmap_ops(struct mddev *mddev);
+static inline bool md_bitmap_registered(struct mddev *mddev)
+{
+ return mddev->bitmap_ops != NULL;
+}
+
+static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush)
+{
+ /* bitmap_ops must be registered before creating bitmap. */
+ if (!md_bitmap_registered(mddev))
+ return false;
+
+ if (!mddev->bitmap)
+ return false;
+
+ return mddev->bitmap_ops->enabled(mddev->bitmap, flush);
+}
+
+static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
+{
+ /* always resync if no bitmap */
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return true;
+ }
+
+ return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded);
+}
+
+static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return;
+ }
+
+ mddev->bitmap_ops->end_sync(mddev, offset, blocks);
+}
+
+#ifdef CONFIG_MD_BITMAP
+int md_bitmap_init(void);
+void md_bitmap_exit(void);
+#else
+static inline int md_bitmap_init(void)
+{
+ return 0;
+}
+static inline void md_bitmap_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_MD_LLBITMAP
+int md_llbitmap_init(void);
+void md_llbitmap_exit(void);
+#else
+static inline int md_llbitmap_init(void)
+{
+ return 0;
+}
+static inline void md_llbitmap_exit(void)
+{
+}
+#endif
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6e9a0045f0ff..11f1e91d387d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
ret = mddev->bitmap_ops->resize(mddev,
le64_to_cpu(msg->high),
- 0, false);
+ 0);
break;
default:
ret = -1;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 3e1f165c2d20..7033d982d377 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -257,18 +257,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
- struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, &mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio = bio_submit_split_bioset(bio, end_sector - bio_sector,
+ &mddev->bio_set);
+ if (!bio)
return true;
- }
-
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
}
md_account_bio(mddev, &bio);
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
new file mode 100644
index 000000000000..1eb434306162
--- /dev/null
+++ b/drivers/md/md-llbitmap.c
@@ -0,0 +1,1626 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <trace/events/block.h>
+
+#include "md.h"
+#include "md-bitmap.h"
+
+/*
+ * #### Background
+ *
+ * Redundant data is used to enhance data fault tolerance, and the storage
+ * methods for redundant data vary depending on the RAID levels. And it's
+ * important to maintain the consistency of redundant data.
+ *
+ * Bitmap is used to record which data blocks have been synchronized and which
+ * ones need to be resynchronized or recovered. Each bit in the bitmap
+ * represents a segment of data in the array. When a bit is set, it indicates
+ * that the multiple redundant copies of that data segment may not be
+ * consistent. Data synchronization can be performed based on the bitmap after
+ * power failure or readding a disk. If there is no bitmap, a full disk
+ * synchronization is required.
+ *
+ * #### Key Features
+ *
+ * - IO fastpath is lockless, if user issues lots of write IO to the same
+ * bitmap bit in a short time, only the first write has additional overhead
+ * to update bitmap bit, no additional overhead for the following writes;
+ * - only written data is resynced or recovered, so when creating a new array
+ * or replacing a disk with a new one, there is no need to do a full disk
+ * resync/recovery;
+ *
+ * #### Key Concept
+ *
+ * ##### State Machine
+ *
+ * Each bit is one byte and has 5 different states, see llbitmap_state. There
+ * are 8 different actions that can change the state, see llbitmap_action:
+ *
+ * llbitmap state machine: transitions between states
+ *
+ * | | Startwrite | Startsync | Endsync | Abortsync|
+ * | --------- | ---------- | --------- | ------- | ------- |
+ * | Unwritten | Dirty | x | x | x |
+ * | Clean | Dirty | x | x | x |
+ * | Dirty | x | x | x | x |
+ * | NeedSync | x | Syncing | x | x |
+ * | Syncing | x | Syncing | Dirty | NeedSync |
+ *
+ * | | Reload | Daemon | Discard | Stale |
+ * | --------- | -------- | ------ | --------- | --------- |
+ * | Unwritten | x | x | x | x |
+ * | Clean | x | x | Unwritten | NeedSync |
+ * | Dirty | NeedSync | Clean | Unwritten | NeedSync |
+ * | NeedSync | x | x | Unwritten | x |
+ * | Syncing | NeedSync | x | Unwritten | NeedSync |
+ *
+ * Typical scenarios:
+ *
+ * 1) Create new array
+ * All bits will be set to Unwritten by default, if --assume-clean is set,
+ * all bits will be set to Clean instead.
+ *
+ * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and
+ * rely on xor data
+ *
+ * 2.1) write new data to raid1/raid10:
+ * Unwritten --StartWrite--> Dirty
+ *
+ * 2.2) write new data to raid456:
+ * Unwritten --StartWrite--> NeedSync
+ *
+ * Because the initial recover for raid456 is skipped, the xor data is not built
+ * yet, the bit must be set to NeedSync first and after lazy initial recover is
+ * finished, the bit will finally set to Dirty(see 5.1 and 5.4);
+ *
+ * 2.3) cover write
+ * Clean --StartWrite--> Dirty
+ *
+ * 3) daemon, if the array is not degraded:
+ * Dirty --Daemon--> Clean
+ *
+ * 4) discard
+ * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
+ *
+ * 5) resync and recover
+ *
+ * 5.1) common process
+ * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
+ *
+ * 5.2) resync after power failure
+ * Dirty --Reload--> NeedSync
+ *
+ * 5.3) recover while replacing with a new disk
+ * By default, the old bitmap framework will recover all data, and llbitmap
+ * implements this by a new helper, see llbitmap_skip_sync_blocks:
+ *
+ * skip recover for bits other than dirty or clean;
+ *
+ * 5.4) lazy initial recover for raid5:
+ * By default, the old bitmap framework will only allow new recover when there
+ * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added
+ * to perform raid456 lazy recover for set bits(from 2.2).
+ *
+ * 6. special handling for degraded array:
+ *
+ * - Dirty bits will never be cleared, daemon will just do nothing, so that if
+ * a disk is readded, Clean bits can be skipped with recovery;
+ * - Dirty bits will convert to Syncing from start sync, to do data recovery
+ * for newly added disks;
+ * - New write will convert bits to NeedSync directly;
+ *
+ * ##### Bitmap IO
+ *
+ * ##### Chunksize
+ *
+ * The default bitmap size is 128k, including the 1k bitmap super block, and
+ * the default size of the data segment tracked by each bit (chunksize) is 64k;
+ * chunksize is doubled repeatedly while the total number of bits is not less
+ * than 127k (see llbitmap_init).
+ *
+ * ##### READ
+ *
+ * While creating bitmap, all pages will be allocated and read for llbitmap,
+ * there won't be read afterwards
+ *
+ * ##### WRITE
+ *
+ * WRITE IO is divided into logical_block_size of the array, the dirty state
+ * of each block is tracked independently, for example:
+ *
+ * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit;
+ *
+ * | page0 | page1 | ... | page 31 |
+ * | |
+ * | \-----------------------\
+ * | |
+ * | block0 | block1 | ... | block 7|
+ * | |
+ * | \-----------------\
+ * | |
+ * | bit0 | bit1 | ... | bit511 |
+ *
+ * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding
+ * subpage will be marked dirty, such block must write first before the IO is
+ * issued. This behaviour will affect IO performance, to reduce the impact, if
+ * multiple bits are changed in the same block in a short time, all bits in this
+ * block will be changed to Dirty/NeedSync, so that there won't be any overhead
+ * until daemon clears dirty bits.
+ *
+ * ##### Dirty Bits synchronization
+ *
+ * IO fast path will set bits to dirty, and those dirty bits will be cleared
+ * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between
+ * IO path and daemon;
+ *
+ * IO path:
+ * 1) try to grab a reference, if succeed, set expire time after 5s and return;
+ * 2) if failed to grab a reference, wait for daemon to finish clearing dirty
+ * bits;
+ *
+ * Daemon (Daemon will be woken up every daemon_sleep seconds):
+ * For each page:
+ * 1) check if page expired, if not skip this page; for expired page:
+ * 2) suspend the page and wait for inflight write IO to be done;
+ * 3) change dirty page to clean;
+ * 4) resume the page;
+ */
+
+#define BITMAP_DATA_OFFSET 1024
+
+/* 64k is the max IO size of sync IO for raid1/raid10 */
+#define MIN_CHUNK_SIZE (64 * 2)
+
+/* By default, daemon will be woken up every 30s */
+#define DEFAULT_DAEMON_SLEEP 30
+
+/*
+ * Dirtied bits that have not been accessed for more than 5s will be cleared
+ * by daemon.
+ */
+#define DEFAULT_BARRIER_IDLE 5
+
+enum llbitmap_state {
+ /* No valid data, init state after assembling the array */
+ BitUnwritten = 0,
+ /* data is consistent */
+ BitClean,
+ /* data will be consistent after IO is done, set directly for writes */
+ BitDirty,
+ /*
+ * data need to be resynchronized:
+ * 1) set directly for writes if array is degraded, prevent full disk
+ * synchronization after readding a disk;
+ * 2) reassemble the array after power failure, and dirty bits are
+ * found after reloading the bitmap;
+ * 3) set for first write for raid5, to build initial xor data lazily
+ */
+ BitNeedSync,
+ /* data is synchronizing */
+ BitSyncing,
+ BitStateCount,
+ BitNone = 0xff,
+};
+
+enum llbitmap_action {
+ /* User write new data, this is the only action from IO fast path */
+ BitmapActionStartwrite = 0,
+ /* Start recovery */
+ BitmapActionStartsync,
+ /* Finish recovery */
+ BitmapActionEndsync,
+ /* Failed recovery */
+ BitmapActionAbortsync,
+ /* Reassemble the array */
+ BitmapActionReload,
+ /* Daemon thread is trying to clear dirty bits */
+ BitmapActionDaemon,
+ /* Data is deleted */
+ BitmapActionDiscard,
+ /*
+ * Bitmap is stale, mark all bits other than BitUnwritten as
+ * BitNeedSync.
+ */
+ BitmapActionStale,
+ BitmapActionCount,
+ /* Init state is BitUnwritten */
+ BitmapActionInit,
+};
+
+enum llbitmap_page_state {
+ LLPageFlush = 0,
+ LLPageDirty,
+};
+
+struct llbitmap_page_ctl {
+ char *state;
+ struct page *page;
+ unsigned long expire;
+ unsigned long flags;
+ wait_queue_head_t wait;
+ struct percpu_ref active;
+ /* Per block size dirty state, maximum 64k page / 1 sector = 128 */
+ unsigned long dirty[];
+};
+
+struct llbitmap {
+ struct mddev *mddev;
+ struct llbitmap_page_ctl **pctl;
+
+ unsigned int nr_pages;
+ unsigned int io_size;
+ unsigned int blocks_per_page;
+
+ /* shift of one chunk */
+ unsigned long chunkshift;
+ /* size of one chunk in sector */
+ unsigned long chunksize;
+ /* total number of chunks */
+ unsigned long chunks;
+ unsigned long last_end_sync;
+ /*
+ * time in seconds that dirty bits will be cleared if the page is not
+ * accessed.
+ */
+ unsigned long barrier_idle;
+ /* fires on first BitDirty state */
+ struct timer_list pending_timer;
+ struct work_struct daemon_work;
+
+ unsigned long flags;
+ __u64 events_cleared;
+
+ /* for slow disks */
+ atomic_t behind_writes;
+ wait_queue_head_t behind_wait;
+};
+
+struct llbitmap_unplug_work {
+ struct work_struct work;
+ struct llbitmap *llbitmap;
+ struct completion *done;
+};
+
+static struct workqueue_struct *md_llbitmap_io_wq;
+static struct workqueue_struct *md_llbitmap_unplug_wq;
+
+static char state_machine[BitStateCount][BitmapActionCount] = {
+ [BitUnwritten] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitNone,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitClean] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitDirty] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitClean,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitNeedSync] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitSyncing] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitDirty,
+ [BitmapActionAbortsync] = BitNeedSync,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+};
+
+static void __llbitmap_flush(struct mddev *mddev);
+
+static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
+{
+ unsigned int idx;
+ unsigned int offset;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ offset = offset_in_page(pos);
+
+ return llbitmap->pctl[idx]->state[offset];
+}
+
+/* set all the bits in the subpage as dirty */
+static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
+ struct llbitmap_page_ctl *pctl,
+ unsigned int block)
+{
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ unsigned int io_size = llbitmap->io_size;
+ int pos;
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ switch (pctl->state[pos]) {
+ case BitUnwritten:
+ pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
+ break;
+ case BitClean:
+ pctl->state[pos] = BitDirty;
+ break;
+ };
+ }
+}
+
+static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
+ int offset)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+ unsigned int io_size = llbitmap->io_size;
+ int block = offset / io_size;
+ int pos;
+
+ if (!test_bit(LLPageDirty, &pctl->flags))
+ set_bit(LLPageDirty, &pctl->flags);
+
+ /*
+ * For a degraded array, dirty bits will never be cleared, and we must
+ * resync all the dirty bits, hence skip infecting new dirty bits to
+ * avoid resyncing unnecessary data.
+ */
+ if (llbitmap->mddev->degraded) {
+ set_bit(block, pctl->dirty);
+ return;
+ }
+
+ /*
+ * The subpage usually contains a total of 512 bits. If any single bit
+ * within the subpage is marked as dirty, the entire sector will be
+ * written. To avoid impacting write performance, when multiple bits
+ * within the same sector are modified within llbitmap->barrier_idle,
+ * all bits in the sector will be collectively marked as dirty at once.
+ */
+ if (test_and_set_bit(block, pctl->dirty)) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ if (pos == offset)
+ continue;
+ if (pctl->state[pos] == BitDirty ||
+ pctl->state[pos] == BitNeedSync) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+ }
+}
+
+static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
+ loff_t pos)
+{
+ unsigned int idx;
+ unsigned int bit;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ bit = offset_in_page(pos);
+
+ llbitmap->pctl[idx]->state[bit] = state;
+ if (state == BitDirty || state == BitNeedSync)
+ llbitmap_set_page_dirty(llbitmap, idx, bit);
+}
+
+static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ struct page *page = NULL;
+ struct md_rdev *rdev;
+
+ if (llbitmap->pctl && llbitmap->pctl[idx])
+ page = llbitmap->pctl[idx]->page;
+ if (page)
+ return page;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset +
+ (idx << PAGE_SECTORS_SHIFT);
+
+ if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
+ true))
+ return page;
+
+ md_error(mddev, rdev);
+ }
+
+ __free_page(page);
+ return ERR_PTR(-EIO);
+}
+
+static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
+{
+ struct page *page = llbitmap->pctl[idx]->page;
+ struct mddev *mddev = llbitmap->mddev;
+ struct md_rdev *rdev;
+ int block;
+
+ for (block = 0; block < llbitmap->blocks_per_page; block++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ if (!test_and_clear_bit(block, pctl->dirty))
+ continue;
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+ sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset + rdev->sb_start +
+ (idx << PAGE_SECTORS_SHIFT) +
+ block * bit_sector;
+ md_write_metadata(mddev, rdev, sector,
+ llbitmap->io_size, page,
+ block * llbitmap->io_size);
+ }
+ }
+}
+
+static void active_release(struct percpu_ref *ref)
+{
+ struct llbitmap_page_ctl *pctl =
+ container_of(ref, struct llbitmap_page_ctl, active);
+
+ wake_up(&pctl->wait);
+}
+
+static void llbitmap_free_pages(struct llbitmap *llbitmap)
+{
+ int i;
+
+ if (!llbitmap->pctl)
+ return;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ if (!pctl || !pctl->page)
+ break;
+
+ __free_page(pctl->page);
+ percpu_ref_exit(&pctl->active);
+ }
+
+ kfree(llbitmap->pctl[0]);
+ kfree(llbitmap->pctl);
+ llbitmap->pctl = NULL;
+}
+
+static int llbitmap_cache_pages(struct llbitmap *llbitmap)
+{
+ struct llbitmap_page_ctl *pctl;
+ unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
+ BITMAP_DATA_OFFSET, PAGE_SIZE);
+ unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
+ llbitmap->blocks_per_page));
+ int i;
+
+ llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!llbitmap->pctl)
+ return -ENOMEM;
+
+ size = round_up(size, cache_line_size());
+ pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
+ if (!pctl) {
+ kfree(llbitmap->pctl);
+ return -ENOMEM;
+ }
+
+ llbitmap->nr_pages = nr_pages;
+
+ for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
+ struct page *page = llbitmap_read_page(llbitmap, i);
+
+ llbitmap->pctl[i] = pctl;
+
+ if (IS_ERR(page)) {
+ llbitmap_free_pages(llbitmap);
+ return PTR_ERR(page);
+ }
+
+ if (percpu_ref_init(&pctl->active, active_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ __free_page(page);
+ llbitmap_free_pages(llbitmap);
+ return -ENOMEM;
+ }
+
+ pctl->page = page;
+ pctl->state = page_address(page);
+ init_waitqueue_head(&pctl->wait);
+ }
+
+ return 0;
+}
+
+static void llbitmap_init_state(struct llbitmap *llbitmap)
+{
+ enum llbitmap_state state = BitUnwritten;
+ unsigned long i;
+
+ if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
+ state = BitClean;
+
+ for (i = 0; i < llbitmap->chunks; i++)
+ llbitmap_write(llbitmap, state, i);
+}
+
+/*
+ * Apply @action to every bit in the inclusive range [@start, @end] and write
+ * back the resulting states.
+ *
+ * Nothing is done once BITMAP_WRITE_ERROR is set. BitmapActionInit ignores the
+ * range and initializes the whole bitmap via llbitmap_init_state(). If a bit
+ * is, or becomes, BitNeedSync while the array is not degraded, the md thread
+ * is woken with MD_RECOVERY_NEEDED plus either MD_RECOVERY_SYNC or, when
+ * raid_is_456() is true, MD_RECOVERY_LAZY_RECOVER.
+ *
+ * Returns the state computed for the last bit in the range, or BitNone when
+ * no bit was processed.
+ */
+/* The return value is only used from resync, where @start == @end. */
+static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
+ unsigned long start,
+ unsigned long end,
+ enum llbitmap_action action)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ enum llbitmap_state state = BitNone;
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ bool need_resync = false;
+ bool need_recovery = false;
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+ return BitNone;
+
+ if (action == BitmapActionInit) {
+ llbitmap_init_state(llbitmap);
+ return BitNone;
+ }
+
+ while (start <= end) {
+ enum llbitmap_state c = llbitmap_read(llbitmap, start);
+
+ /* An out-of-range stored state is replaced by BitNeedSync. */
+ if (c < 0 || c >= BitStateCount) {
+ pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
+ __func__, start, c, action);
+ state = BitNeedSync;
+ goto write_bitmap;
+ }
+
+ if (c == BitNeedSync)
+ need_resync = !mddev->degraded;
+
+ /* Look up the transition for (current state, action). */
+ state = state_machine[c][action];
+
+write_bitmap:
+ if (unlikely(mddev->degraded)) {
+ /* For degraded array, mark new data as need sync. */
+ if (state == BitDirty &&
+ action == BitmapActionStartwrite)
+ state = BitNeedSync;
+ /*
+ * For degraded array, resync dirty data as well, noted
+ * if array is still degraded after resync is done, all
+ * new data will still be dirty until array is clean.
+ */
+ else if (c == BitDirty &&
+ action == BitmapActionStartsync)
+ state = BitSyncing;
+ } else if (c == BitUnwritten && state == BitDirty &&
+ action == BitmapActionStartwrite && level_456) {
+ /* Delay raid456 initial recovery to first write. */
+ state = BitNeedSync;
+ }
+
+ /* BitNone means there is nothing to write for this bit. */
+ if (state == BitNone) {
+ start++;
+ continue;
+ }
+
+ llbitmap_write(llbitmap, state, start);
+
+ /*
+ * A newly dirty bit arms the pending timer (unless it is already
+ * pending) so the daemon work is queued after daemon_sleep seconds.
+ */
+ if (state == BitNeedSync)
+ need_resync = !mddev->degraded;
+ else if (state == BitDirty &&
+ !timer_pending(&llbitmap->pending_timer))
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + mddev->bitmap_info.daemon_sleep * HZ);
+
+ start++;
+ }
+
+ /* When raid_is_456() is true, a needed resync is run as lazy recovery. */
+ if (need_resync && level_456)
+ need_recovery = true;
+
+ if (need_recovery) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ } else if (need_resync) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+
+ return state;
+}
+
+/*
+ * Take a live reference on the page's percpu_ref, sleeping on pctl->wait for
+ * as long as the reference is dying. Once the reference is held, push the
+ * page's expire time barrier_idle seconds into the future.
+ */
+static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+	struct llbitmap_page_ctl *ctl = llbitmap->pctl[page_idx];
+
+	while (unlikely(!percpu_ref_tryget_live(&ctl->active)))
+		wait_event(ctl->wait, !percpu_ref_is_dying(&ctl->active));
+
+	WRITE_ONCE(ctl->expire, jiffies + llbitmap->barrier_idle * HZ);
+}
+
+/* Drop the reference taken by llbitmap_raise_barrier() on page @page_idx. */
+static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+	percpu_ref_put(&llbitmap->pctl[page_idx]->active);
+}
+
+/*
+ * Kill the page's percpu_ref and wait, for at most daemon_sleep seconds, until
+ * all references have been dropped.
+ *
+ * Return: 0 once the reference count reached zero; the caller must then call
+ * llbitmap_resume(). -ETIMEDOUT if the references did not drain in time. In
+ * that case the reference is resurrected and waiters are woken here, because
+ * the caller skips llbitmap_resume() on failure and llbitmap_raise_barrier()
+ * would otherwise sleep forever on a dying reference.
+ */
+static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
+{
+	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+	percpu_ref_kill(&pctl->active);
+
+	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
+			llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
+		/* Undo the kill so that writers can make progress again. */
+		percpu_ref_resurrect(&pctl->active);
+		wake_up(&pctl->wait);
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/*
+ * Counterpart of llbitmap_suspend_timeout(): reset the page's expire time to
+ * LONG_MAX, bring the percpu_ref back to life and wake everybody sleeping on
+ * the page's wait queue.
+ */
+static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
+{
+	struct llbitmap_page_ctl *ctl = llbitmap->pctl[page_idx];
+
+	ctl->expire = LONG_MAX;
+	percpu_ref_resurrect(&ctl->active);
+	wake_up(&ctl->wait);
+}
+
+/*
+ * Reject array configurations llbitmap cannot work with.
+ *
+ * Return: 0 if supported, -EBUSY for arrays with a journal, -ENOSPC when no
+ * bitmap space is available, -EOPNOTSUPP for non-persistent arrays, bitmap
+ * files, external metadata and dm-raid.
+ */
+static int llbitmap_check_support(struct mddev *mddev)
+{
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
+			  mdname(mddev));
+		return -EBUSY;
+	}
+
+	if (mddev->bitmap_info.space == 0 &&
+	    mddev->bitmap_info.default_space == 0) {
+		pr_notice("md/llbitmap: %s: no space for bitmap\n",
+			  mdname(mddev));
+		return -ENOSPC;
+	}
+
+	if (!mddev->persistent) {
+		pr_notice("md/llbitmap: %s: array must be persistent\n",
+			  mdname(mddev));
+		return -EOPNOTSUPP;
+	}
+
+	if (mddev->bitmap_info.file) {
+		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
+			  mdname(mddev));
+		return -EOPNOTSUPP;
+	}
+
+	if (mddev->bitmap_info.external) {
+		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
+			  mdname(mddev));
+		return -EOPNOTSUPP;
+	}
+
+	if (mddev_is_dm(mddev)) {
+		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
+			  mdname(mddev));
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up a brand-new bitmap.
+ *
+ * Starting from MIN_CHUNK_SIZE, the chunk size is doubled until the number of
+ * chunks no longer exceeds the reserved space (bitmap_info.space sectors,
+ * converted to bytes). The pages are then cached, every bit is initialized and
+ * the result is flushed to disk.
+ *
+ * Return: 0 on success or the error from llbitmap_cache_pages().
+ */
+static int llbitmap_init(struct llbitmap *llbitmap)
+{
+	struct mddev *mddev = llbitmap->mddev;
+	sector_t blocks = mddev->resync_max_sectors;
+	unsigned long chunksize = MIN_CHUNK_SIZE;
+	/* blocks is a sector_t: use the sector_t-safe division helper. */
+	unsigned long chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
+	int ret;
+
+	while (chunks > space) {
+		chunksize = chunksize << 1;
+		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+	}
+
+	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+	llbitmap->chunkshift = ffz(~chunksize);
+	llbitmap->chunksize = chunksize;
+	llbitmap->chunks = chunks;
+	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
+
+	ret = llbitmap_cache_pages(llbitmap);
+	if (ret)
+		return ret;
+
+	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+			       BitmapActionInit);
+	/* flush initial llbitmap to disk */
+	__llbitmap_flush(mddev);
+
+	return 0;
+}
+
+/*
+ * Read and validate the bitmap superblock, then set up the in-memory bitmap.
+ *
+ * When BITMAP_FIRST_USE is set in the on-disk state the bitmap is created from
+ * scratch by llbitmap_init(); otherwise chunksize, daemon_sleep and the event
+ * counter are taken from the superblock and the bitmap pages are cached.
+ *
+ * Return: 0 on success, -EINVAL for a missing or invalid superblock, -EIO if
+ * the superblock page cannot be read, or the error from llbitmap_init() /
+ * llbitmap_cache_pages().
+ */
+static int llbitmap_read_sb(struct llbitmap *llbitmap)
+{
+	struct mddev *mddev = llbitmap->mddev;
+	unsigned long daemon_sleep;
+	unsigned long chunksize;
+	/* sb->events is 64 bit; do not truncate it on 32-bit builds. */
+	u64 events;
+	struct page *sb_page;
+	bitmap_super_t *sb;
+	int ret = -EINVAL;
+
+	if (!mddev->bitmap_info.offset) {
+		pr_err("md/llbitmap: %s: no super block found\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
+
+	sb_page = llbitmap_read_page(llbitmap, 0);
+	if (IS_ERR(sb_page)) {
+		pr_err("md/llbitmap: %s: read super block failed\n",
+		       mdname(mddev));
+		return -EIO;
+	}
+
+	sb = kmap_local_page(sb_page);
+	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
+		pr_err("md/llbitmap: %s: invalid super block magic number\n",
+		       mdname(mddev));
+		goto out_put_page;
+	}
+
+	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
+		pr_err("md/llbitmap: %s: invalid super block version\n",
+		       mdname(mddev));
+		goto out_put_page;
+	}
+
+	if (memcmp(sb->uuid, mddev->uuid, 16)) {
+		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
+		       mdname(mddev));
+		goto out_put_page;
+	}
+
+	/* Prefer the reserved size recorded on disk over the default. */
+	if (mddev->bitmap_info.space == 0) {
+		int room = le32_to_cpu(sb->sectors_reserved);
+
+		if (room)
+			mddev->bitmap_info.space = room;
+		else
+			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
+	}
+	llbitmap->flags = le32_to_cpu(sb->state);
+	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
+		ret = llbitmap_init(llbitmap);
+		goto out_put_page;
+	}
+
+	chunksize = le32_to_cpu(sb->chunksize);
+	if (!is_power_of_2(chunksize)) {
+		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
+		       mdname(mddev));
+		goto out_put_page;
+	}
+
+	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
+					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
+		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
+		       mdname(mddev), chunksize, mddev->resync_max_sectors,
+		       mddev->bitmap_info.space);
+		goto out_put_page;
+	}
+
+	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
+		pr_err("md/llbitmap: %s: daemon sleep %lu period out of range\n",
+		       mdname(mddev), daemon_sleep);
+		goto out_put_page;
+	}
+
+	events = le64_to_cpu(sb->events);
+	if (events < mddev->events) {
+		pr_warn("md/llbitmap: %s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
+			mdname(mddev), events, mddev->events);
+		set_bit(BITMAP_STALE, &llbitmap->flags);
+	}
+
+	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+	mddev->bitmap_info.chunksize = chunksize;
+	mddev->bitmap_info.daemon_sleep = daemon_sleep;
+
+	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+	llbitmap->chunksize = chunksize;
+	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
+	llbitmap->chunkshift = ffz(~chunksize);
+	ret = llbitmap_cache_pages(llbitmap);
+
+out_put_page:
+	/* Drop the mapping before the page it maps is freed. */
+	kunmap_local(sb);
+	__free_page(sb_page);
+	return ret;
+}
+
+/*
+ * Timer callback: queue the daemon work on md_llbitmap_io_wq. If the previous
+ * daemon work is still busy, warn and set BITMAP_DAEMON_BUSY instead.
+ */
+static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
+{
+	struct llbitmap *llbitmap =
+		container_of(pending_timer, struct llbitmap, pending_timer);
+
+	if (!work_busy(&llbitmap->daemon_work)) {
+		queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+		return;
+	}
+
+	pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
+		mdname(llbitmap->mddev),
+		llbitmap->mddev->bitmap_info.daemon_sleep);
+	set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
+}
+
+/*
+ * Daemon work: walk every bitmap page and, for each page that is flagged
+ * LLPageFlush or whose expire time has passed, suspend the page, run
+ * BitmapActionDaemon over the chunks stored in it and resume the page.
+ * Returns immediately while the array is degraded.
+ */
+static void md_llbitmap_daemon_fn(struct work_struct *work)
+{
+ struct llbitmap *llbitmap =
+ container_of(work, struct llbitmap, daemon_work);
+ unsigned long start;
+ unsigned long end;
+ bool restart;
+ int idx;
+
+ if (llbitmap->mddev->degraded)
+ return;
+retry:
+ start = 0;
+ /* Page 0 holds BITMAP_DATA_OFFSET fewer chunks than a full page. */
+ end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
+ restart = false;
+
+ for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ /* Later pages cover up to PAGE_SIZE chunks, clamped to the last chunk. */
+ if (idx > 0) {
+ start = end + 1;
+ end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
+ }
+
+ /* Not flagged for flush and not expired yet: retry via the timer. */
+ if (!test_bit(LLPageFlush, &pctl->flags) &&
+ time_before(jiffies, pctl->expire)) {
+ restart = true;
+ continue;
+ }
+
+ if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
+ pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
+ mdname(llbitmap->mddev), __func__, idx);
+ continue;
+ }
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
+ llbitmap_resume(llbitmap, idx);
+ }
+
+ /*
+ * If the daemon took a long time to finish, retry to prevent missing
+ * clearing dirty bits.
+ */
+ if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
+ goto retry;
+
+ /* If some page is dirty but not expired, setup timer again */
+ if (restart)
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
+}
+
+/*
+ * Allocate an llbitmap for @mddev, attach it as mddev->bitmap and load its
+ * superblock.
+ *
+ * Return: 0 on success, the error from llbitmap_check_support() or
+ * llbitmap_read_sb(), or -ENOMEM. On failure mddev->bitmap is left NULL.
+ */
+static int llbitmap_create(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap;
+	int ret;
+
+	ret = llbitmap_check_support(mddev);
+	if (ret)
+		return ret;
+
+	llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
+	if (!llbitmap)
+		return -ENOMEM;
+
+	llbitmap->mddev = mddev;
+	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
+	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;
+
+	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
+	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
+	atomic_set(&llbitmap->behind_writes, 0);
+	init_waitqueue_head(&llbitmap->behind_wait);
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+	mddev->bitmap = llbitmap;
+	ret = llbitmap_read_sb(llbitmap);
+	/*
+	 * Detach the bitmap while the mutex is still held, so that readers
+	 * taking bitmap_info.mutex never see a pointer that is being freed.
+	 */
+	if (ret)
+		mddev->bitmap = NULL;
+	mutex_unlock(&mddev->bitmap_info.mutex);
+
+	if (ret)
+		kfree(llbitmap);
+
+	return ret;
+}
+
+/*
+ * Recompute the chunk geometry for an array of @blocks sectors. A @chunksize
+ * of 0 means "keep the current one". The chunk size is doubled until the chunk
+ * count fits into the reserved space. Always returns 0.
+ */
+static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long chunks;
+
+	if (!chunksize)
+		chunksize = llbitmap->chunksize;
+
+	/* If there is enough space, leave the chunksize unchanged. */
+	for (chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+	     chunks > mddev->bitmap_info.space << SECTOR_SHIFT;
+	     chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize))
+		chunksize = chunksize << 1;
+
+	llbitmap->chunkshift = ffz(~chunksize);
+	llbitmap->chunksize = chunksize;
+	llbitmap->chunks = chunks;
+
+	return 0;
+}
+
+/*
+ * Run the state machine over all chunks with BitmapActionStale when
+ * BITMAP_STALE was set (the flag is cleared here), BitmapActionReload
+ * otherwise. Always returns 0.
+ */
+static int llbitmap_load(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	bool stale = test_and_clear_bit(BITMAP_STALE, &llbitmap->flags);
+
+	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+			       stale ? BitmapActionStale : BitmapActionReload);
+	return 0;
+}
+
+/*
+ * Tear down the bitmap attached to @mddev, if any: stop the pending timer,
+ * drain both llbitmap workqueues, then free the cached pages and the llbitmap
+ * itself, all under bitmap_info.mutex.
+ */
+static void llbitmap_destroy(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+
+	if (!llbitmap)
+		return;
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+
+	/*
+	 * Shut the timer down instead of merely deleting it: the daemon work
+	 * flushed below may call mod_timer(), and a re-armed timer would fire
+	 * after llbitmap has been freed.
+	 */
+	timer_shutdown_sync(&llbitmap->pending_timer);
+	flush_workqueue(md_llbitmap_io_wq);
+	flush_workqueue(md_llbitmap_unplug_wq);
+
+	mddev->bitmap = NULL;
+	llbitmap_free_pages(llbitmap);
+	kfree(llbitmap);
+	mutex_unlock(&mddev->bitmap_info.mutex);
+}
+
+/*
+ * A write to [@offset, @offset + @sectors) is about to start: apply
+ * BitmapActionStartwrite to every chunk it touches, then raise the barrier on
+ * every bitmap page holding one of those chunks.
+ */
+static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
+				 unsigned long sectors)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long first = offset >> llbitmap->chunkshift;
+	unsigned long last = (offset + sectors - 1) >> llbitmap->chunkshift;
+	int first_page = (first + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int last_page = (last + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int idx;
+
+	llbitmap_state_machine(llbitmap, first, last, BitmapActionStartwrite);
+
+	for (idx = first_page; idx <= last_page; idx++)
+		llbitmap_raise_barrier(llbitmap, idx);
+}
+
+/*
+ * A write to [@offset, @offset + @sectors) has finished: release the barrier
+ * on every bitmap page that holds one of its chunks.
+ */
+static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
+			       unsigned long sectors)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long first = offset >> llbitmap->chunkshift;
+	unsigned long last = (offset + sectors - 1) >> llbitmap->chunkshift;
+	int first_page = (first + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int last_page = (last + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int idx;
+
+	for (idx = first_page; idx <= last_page; idx++)
+		llbitmap_release_barrier(llbitmap, idx);
+}
+
+/*
+ * A discard of [@offset, @offset + @sectors) is about to start. The first
+ * chunk index is rounded up from @offset; the last is the chunk containing the
+ * final sector. BitmapActionDiscard is applied to that range and the barrier
+ * is raised on the corresponding bitmap pages.
+ */
+static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
+				   unsigned long sectors)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long first = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+	unsigned long last = (offset + sectors - 1) >> llbitmap->chunkshift;
+	int first_page = (first + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int last_page = (last + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int idx;
+
+	llbitmap_state_machine(llbitmap, first, last, BitmapActionDiscard);
+
+	for (idx = first_page; idx <= last_page; idx++)
+		llbitmap_raise_barrier(llbitmap, idx);
+}
+
+/*
+ * A discard of [@offset, @offset + @sectors) has finished: release the barrier
+ * on the bitmap pages computed the same way as in llbitmap_start_discard().
+ */
+static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
+				 unsigned long sectors)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long first = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+	unsigned long last = (offset + sectors - 1) >> llbitmap->chunkshift;
+	int first_page = (first + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int last_page = (last + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+	int idx;
+
+	for (idx = first_page; idx <= last_page; idx++)
+		llbitmap_release_barrier(llbitmap, idx);
+}
+
+/*
+ * Work function behind llbitmap_unplug(): write out every page whose
+ * LLPageDirty flag is set (clearing the flag), wait for the writes with
+ * md_super_wait() and signal the completion supplied by the submitter.
+ */
+static void llbitmap_unplug_fn(struct work_struct *work)
+{
+	struct llbitmap_unplug_work *unplug_work =
+		container_of(work, struct llbitmap_unplug_work, work);
+	struct llbitmap *llbitmap = unplug_work->llbitmap;
+	struct blk_plug plug;
+	int idx;
+
+	blk_start_plug(&plug);
+
+	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+		unsigned long *flags = &llbitmap->pctl[idx]->flags;
+
+		/* Plain test first, then the atomic test-and-clear. */
+		if (test_bit(LLPageDirty, flags) &&
+		    test_and_clear_bit(LLPageDirty, flags))
+			llbitmap_write_page(llbitmap, idx);
+	}
+
+	blk_finish_plug(&plug);
+	md_super_wait(llbitmap->mddev);
+	complete(unplug_work->done);
+}
+
+/* Return true if at least one bitmap page has LLPageDirty set. */
+static bool llbitmap_dirty(struct llbitmap *llbitmap)
+{
+	int idx = 0;
+
+	while (idx < llbitmap->nr_pages) {
+		if (test_bit(LLPageDirty, &llbitmap->pctl[idx]->flags))
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Write out all dirty bitmap pages and wait until that is done. The actual IO
+ * is issued by llbitmap_unplug_fn() on md_llbitmap_unplug_wq; this function
+ * only queues the on-stack work item and sleeps on its completion. Returns
+ * early when no page is dirty. @sync is not used here.
+ */
+static void llbitmap_unplug(struct mddev *mddev, bool sync)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct llbitmap_unplug_work unplug_work = {
+ .llbitmap = llbitmap,
+ .done = &done,
+ };
+
+ if (!llbitmap_dirty(llbitmap))
+ return;
+
+ /*
+ * Issue new bitmap IO under submit_bio() context will deadlock:
+ * - the bio will wait for bitmap bio to be done, before it can be
+ * issued;
+ * - bitmap bio will be added to current->bio_list and wait for this
+ * bio to be issued;
+ */
+ INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
+ queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
+ /* The work item lives on this stack: wait before destroying it. */
+ wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
+}
+
+/*
+ * Unconditionally write every bitmap page to disk: each page is flagged
+ * LLPageDirty with all of its blocks marked dirty, written out, and the writes
+ * are waited for. Called when stopping the array, or every daemon_sleep
+ * seconds when sync_thread is running.
+ */
+static void __llbitmap_flush(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	struct blk_plug plug;
+	int idx;
+
+	blk_start_plug(&plug);
+	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+		struct llbitmap_page_ctl *ctl = llbitmap->pctl[idx];
+
+		/* mark all blocks as dirty */
+		set_bit(LLPageDirty, &ctl->flags);
+		bitmap_fill(ctl->dirty, llbitmap->blocks_per_page);
+		llbitmap_write_page(llbitmap, idx);
+	}
+	blk_finish_plug(&plug);
+
+	md_super_wait(llbitmap->mddev);
+}
+
+/*
+ * Flag every page LLPageFlush, stop the pending timer, run the daemon work
+ * once and wait for it, then write all pages out with __llbitmap_flush().
+ */
+static void llbitmap_flush(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	int idx = 0;
+
+	while (idx < llbitmap->nr_pages) {
+		set_bit(LLPageFlush, &llbitmap->pctl[idx]->flags);
+		idx++;
+	}
+
+	timer_delete_sync(&llbitmap->pending_timer);
+	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+	flush_work(&llbitmap->daemon_work);
+
+	__llbitmap_flush(mddev);
+}
+
+/*
+ * Used for raid5 lazy initial recovery: report whether the chunk containing
+ * @offset is in state BitClean or BitDirty.
+ */
+static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long chunk = offset >> llbitmap->chunkshift;
+
+	switch (llbitmap_read(llbitmap, chunk)) {
+	case BitClean:
+	case BitDirty:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Return how many sectors, counted from @offset to the end of its chunk, the
+ * sync thread may skip, or 0 if the chunk has to be synced.
+ */
+static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long chunk = offset >> llbitmap->chunkshift;
+	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+	enum llbitmap_state c = llbitmap_read(llbitmap, chunk);
+
+	/* always skip unwritten blocks */
+	if (c == BitUnwritten)
+		return blocks;
+
+	/* For degraded array, don't skip */
+	if (mddev->degraded)
+		return 0;
+
+	/* For resync also skip clean/dirty blocks */
+	if (c != BitClean && c != BitDirty)
+		return 0;
+	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		return 0;
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		return 0;
+
+	return blocks;
+}
+
+/*
+ * Apply BitmapActionStartsync to the chunk containing @offset and store in
+ * @blocks the number of sectors from @offset to the end of that chunk.
+ * Returns true if the chunk ended up in BitSyncing. @degraded is not used.
+ */
+static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
+				sector_t *blocks, bool degraded)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long chunk = offset >> llbitmap->chunkshift;
+	enum llbitmap_state state;
+
+	/*
+	 * Handle one bit at a time, this is much simpler. And it doesn't matter
+	 * if md_do_sync() loop more times.
+	 */
+	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+	state = llbitmap_state_machine(llbitmap, chunk, chunk,
+				       BitmapActionStartsync);
+
+	return state == BitSyncing;
+}
+
+/*
+ * Something is wrong and sync_thread stopped at @offset: apply
+ * BitmapActionAbortsync from the chunk containing @offset up to the last
+ * chunk, and store in @blocks the sectors left in the chunk of @offset.
+ */
+static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
+			      sector_t *blocks)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	unsigned long first = offset >> llbitmap->chunkshift;
+	unsigned long last = llbitmap->chunks - 1;
+
+	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+	llbitmap_state_machine(llbitmap, first, last, BitmapActionAbortsync);
+}
+
+/*
+ * A full sync_thread is finished: expire every page right away and apply
+ * BitmapActionEndsync to all chunks.
+ */
+static void llbitmap_close_sync(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	int idx = 0;
+
+	while (idx < llbitmap->nr_pages) {
+		/* let daemon_fn clear dirty bits immediately */
+		WRITE_ONCE(llbitmap->pctl[idx]->expire, jiffies);
+		idx++;
+	}
+
+	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+			       BitmapActionEndsync);
+}
+
+/*
+ * sync_thread have reached @sector, update metadata every daemon_sleep seconds,
+ * just in case sync_thread have to restart after power failure.
+ *
+ * @force is not used by this implementation.
+ */
+static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
+ bool force)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ /* Sector 0 only records the starting timestamp. */
+ if (sector == 0) {
+ llbitmap->last_end_sync = jiffies;
+ return;
+ }
+
+ /* Less than daemon_sleep seconds since the last update: nothing to do. */
+ if (time_before(jiffies, llbitmap->last_end_sync +
+ HZ * mddev->bitmap_info.daemon_sleep))
+ return;
+
+ /* Wait until recovery_active has dropped to zero. */
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ mddev->curr_resync_completed = sector;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ /* Apply BitmapActionEndsync up to the chunk of @sector, then write out. */
+ llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
+ BitmapActionEndsync);
+ __llbitmap_flush(mddev);
+
+ llbitmap->last_end_sync = jiffies;
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
+}
+
+/*
+ * The bitmap counts as enabled when it exists and BITMAP_WRITE_ERROR is not
+ * set. @flush is not used.
+ */
+static bool llbitmap_enabled(void *data, bool flush)
+{
+	struct llbitmap *llbitmap = data;
+
+	if (!llbitmap)
+		return false;
+
+	return !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+}
+
+/* Apply BitmapActionStartwrite to the chunks @s through @e (inclusive). */
+static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
+				unsigned long e)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+
+	llbitmap_state_machine(llbitmap, s, e, BitmapActionStartwrite);
+}
+
+/*
+ * Mark the io_size blocks of page 0 that cover the first BITMAP_DATA_OFFSET
+ * bytes as dirty, write page 0 and wait for the write.
+ */
+static void llbitmap_write_sb(struct llbitmap *llbitmap)
+{
+	int sb_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);
+	struct llbitmap_page_ctl *first = llbitmap->pctl[0];
+
+	bitmap_fill(first->dirty, sb_blocks);
+	llbitmap_write_page(llbitmap, 0);
+	md_super_wait(llbitmap->mddev);
+}
+
+/*
+ * Refresh the bitmap superblock from the current mddev / llbitmap state and
+ * write it out. Does nothing once BITMAP_WRITE_ERROR is set; sets that flag
+ * if the superblock page cannot be obtained.
+ */
+static void llbitmap_update_sb(void *data)
+{
+	struct llbitmap *llbitmap = data;
+	struct mddev *mddev = llbitmap->mddev;
+	struct page *sb_page;
+	bitmap_super_t *sb;
+
+	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+		return;
+
+	sb_page = llbitmap_read_page(llbitmap, 0);
+	if (IS_ERR(sb_page)) {
+		/* Terminate the message so it is not merged with the next one. */
+		pr_err("%s: %s: read super block failed\n", __func__,
+		       mdname(mddev));
+		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+		return;
+	}
+
+	/* events_cleared must never be ahead of the array's event count. */
+	if (mddev->events < llbitmap->events_cleared)
+		llbitmap->events_cleared = mddev->events;
+
+	sb = kmap_local_page(sb_page);
+	sb->events = cpu_to_le64(mddev->events);
+	sb->state = cpu_to_le32(llbitmap->flags);
+	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
+	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
+	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
+	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);
+
+	kunmap_local(sb);
+	llbitmap_write_sb(llbitmap);
+}
+
+/*
+ * Fill @stats for the bitmap passed in @data. Fields not assigned below are
+ * left zeroed. Always returns 0.
+ */
+static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
+{
+	struct llbitmap *llbitmap = data;
+
+	memset(stats, 0, sizeof(*stats));
+
+	stats->events_cleared = llbitmap->events_cleared;
+	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
+	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
+
+	stats->file_pages = llbitmap->nr_pages;
+	stats->pages = llbitmap->nr_pages;
+	stats->missing_pages = 0;
+
+	return 0;
+}
+
+/*
+ * Just flag all pages as needing to be written: set LLPageDirty and mark every
+ * block of every page dirty. No IO is issued here.
+ */
+static void llbitmap_write_all(struct mddev *mddev)
+{
+	struct llbitmap *llbitmap = mddev->bitmap;
+	int idx = 0;
+
+	while (idx < llbitmap->nr_pages) {
+		struct llbitmap_page_ctl *ctl = llbitmap->pctl[idx];
+
+		set_bit(LLPageDirty, &ctl->flags);
+		bitmap_fill(ctl->dirty, llbitmap->blocks_per_page);
+		idx++;
+	}
+}
+
+/* Account for one more behind write in flight. */
+static void llbitmap_start_behind_write(struct mddev *mddev)
+{
+	struct llbitmap *bitmap = mddev->bitmap;
+
+	atomic_inc(&bitmap->behind_writes);
+}
+
+/*
+ * One behind write has completed; wake the waiters on behind_wait when the
+ * counter drops to zero.
+ */
+static void llbitmap_end_behind_write(struct mddev *mddev)
+{
+	struct llbitmap *bitmap = mddev->bitmap;
+
+	if (!atomic_dec_and_test(&bitmap->behind_writes))
+		return;
+
+	wake_up(&bitmap->behind_wait);
+}
+
+/* Sleep until no behind writes are in flight. No-op without a bitmap. */
+static void llbitmap_wait_behind_writes(struct mddev *mddev)
+{
+	struct llbitmap *bitmap = mddev->bitmap;
+
+	if (bitmap)
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
+}
+
+/*
+ * sysfs "bits": print how many chunks are in each bitmap state. Prints
+ * "no bitmap" when no bitmap (or no page cache) is present and
+ * "bitmap io error" once BITMAP_WRITE_ERROR is set.
+ */
+static ssize_t bits_show(struct mddev *mddev, char *page)
+{
+	struct llbitmap *llbitmap;
+	int count[BitStateCount] = {0};
+	loff_t chunk;
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+	llbitmap = mddev->bitmap;
+	if (!llbitmap || !llbitmap->pctl) {
+		mutex_unlock(&mddev->bitmap_info.mutex);
+		return sprintf(page, "no bitmap\n");
+	}
+
+	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
+		mutex_unlock(&mddev->bitmap_info.mutex);
+		return sprintf(page, "bitmap io error\n");
+	}
+
+	for (chunk = 0; chunk < llbitmap->chunks; chunk++) {
+		enum llbitmap_state c = llbitmap_read(llbitmap, chunk);
+
+		if (c >= 0 && c < BitStateCount) {
+			count[c]++;
+			continue;
+		}
+
+		/* Out-of-range states are reported but not counted. */
+		pr_err("%s: invalid bit %llu state %d\n",
+		       __func__, chunk, c);
+	}
+
+	mutex_unlock(&mddev->bitmap_info.mutex);
+	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
+		       count[BitUnwritten], count[BitClean], count[BitDirty],
+		       count[BitNeedSync], count[BitSyncing]);
+}
+
+static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
+
+/*
+ * sysfs "metadata": print the chunk geometry, the bitmap offset and
+ * daemon_sleep, or "no bitmap" when none is attached.
+ */
+static ssize_t metadata_show(struct mddev *mddev, char *page)
+{
+	struct llbitmap *llbitmap;
+	ssize_t len;
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+	llbitmap = mddev->bitmap;
+	if (llbitmap)
+		len = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
+			      llbitmap->chunksize, llbitmap->chunkshift,
+			      llbitmap->chunks, mddev->bitmap_info.offset,
+			      llbitmap->mddev->bitmap_info.daemon_sleep);
+	else
+		len = sprintf(page, "no bitmap\n");
+	mutex_unlock(&mddev->bitmap_info.mutex);
+
+	return len;
+}
+
+static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
+
+/* sysfs "daemon_sleep": print mddev->bitmap_info.daemon_sleep. */
+static ssize_t
+daemon_sleep_show(struct mddev *mddev, char *page)
+{
+	unsigned long seconds = mddev->bitmap_info.daemon_sleep;
+
+	return sprintf(page, "%lu\n", seconds);
+}
+
+/*
+ * sysfs "daemon_sleep": set mddev->bitmap_info.daemon_sleep from a decimal
+ * number of seconds.
+ *
+ * The value is multiplied by HZ wherever it is used as a timeout, and a
+ * timeout of 0 would make every wait_event_timeout() on it expire at once, so
+ * the same bounds as for the on-disk value in llbitmap_read_sb() are enforced.
+ *
+ * Return: @len on success, the kstrtoul() error for malformed input, or
+ * -EINVAL if the value is out of range.
+ */
+static ssize_t
+daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	unsigned long timeout;
+	int rv = kstrtoul(buf, 10, &timeout);
+
+	if (rv)
+		return rv;
+
+	if (timeout < 1 || timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+		return -EINVAL;
+
+	mddev->bitmap_info.daemon_sleep = timeout;
+	return len;
+}
+
+static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
+
+/*
+ * sysfs "barrier_idle": print llbitmap->barrier_idle, or "no bitmap" when no
+ * bitmap is attached. bitmap_info.mutex is held while mddev->bitmap is read
+ * and dereferenced, as in bits_show() and metadata_show().
+ */
+static ssize_t
+barrier_idle_show(struct mddev *mddev, char *page)
+{
+	struct llbitmap *llbitmap;
+	ssize_t ret;
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+	llbitmap = mddev->bitmap;
+	if (!llbitmap) {
+		mutex_unlock(&mddev->bitmap_info.mutex);
+		return sprintf(page, "no bitmap\n");
+	}
+
+	ret = sprintf(page, "%lu\n", llbitmap->barrier_idle);
+	mutex_unlock(&mddev->bitmap_info.mutex);
+
+	return ret;
+}
+
+/*
+ * sysfs "barrier_idle": set llbitmap->barrier_idle from a decimal number of
+ * seconds. bitmap_info.mutex is held while mddev->bitmap is read and
+ * dereferenced, as in bits_show() and metadata_show().
+ *
+ * Return: @len on success, the kstrtoul() error for malformed input, or
+ * -ENODEV when no bitmap is attached.
+ */
+static ssize_t
+barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	struct llbitmap *llbitmap;
+	unsigned long timeout;
+	int rv = kstrtoul(buf, 10, &timeout);
+
+	if (rv)
+		return rv;
+
+	mutex_lock(&mddev->bitmap_info.mutex);
+	llbitmap = mddev->bitmap;
+	if (!llbitmap) {
+		mutex_unlock(&mddev->bitmap_info.mutex);
+		return -ENODEV;
+	}
+
+	llbitmap->barrier_idle = timeout;
+	mutex_unlock(&mddev->bitmap_info.mutex);
+
+	return len;
+}
+
+static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
+
+/* NULL-terminated list of the llbitmap sysfs attributes defined above. */
+static struct attribute *md_llbitmap_attrs[] = {
+ &llbitmap_bits.attr,
+ &llbitmap_metadata.attr,
+ &llbitmap_daemon_sleep.attr,
+ &llbitmap_barrier_idle.attr,
+ NULL
+};
+
+/* Attribute group named "llbitmap"; referenced by llbitmap_ops.group. */
+static struct attribute_group md_llbitmap_group = {
+ .name = "llbitmap",
+ .attrs = md_llbitmap_attrs,
+};
+
+/*
+ * bitmap_operations table for llbitmap. Its head (type MD_BITMAP, id
+ * ID_LLBITMAP) is what md_llbitmap_init() registers as an md submodule.
+ */
+static struct bitmap_operations llbitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_LLBITMAP,
+ .name = "llbitmap",
+ },
+
+ /* lifecycle */
+ .enabled = llbitmap_enabled,
+ .create = llbitmap_create,
+ .resize = llbitmap_resize,
+ .load = llbitmap_load,
+ .destroy = llbitmap_destroy,
+
+ /* write / discard path and writeback of bitmap pages */
+ .start_write = llbitmap_start_write,
+ .end_write = llbitmap_end_write,
+ .start_discard = llbitmap_start_discard,
+ .end_discard = llbitmap_end_discard,
+ .unplug = llbitmap_unplug,
+ .flush = llbitmap_flush,
+
+ /* behind-write accounting */
+ .start_behind_write = llbitmap_start_behind_write,
+ .end_behind_write = llbitmap_end_behind_write,
+ .wait_behind_writes = llbitmap_wait_behind_writes,
+
+ /* resync / recovery */
+ .blocks_synced = llbitmap_blocks_synced,
+ .skip_sync_blocks = llbitmap_skip_sync_blocks,
+ .start_sync = llbitmap_start_sync,
+ .end_sync = llbitmap_end_sync,
+ .close_sync = llbitmap_close_sync,
+ .cond_end_sync = llbitmap_cond_end_sync,
+
+ /* superblock, statistics and bulk helpers */
+ .update_sb = llbitmap_update_sb,
+ .get_stats = llbitmap_get_stats,
+ .dirty_bits = llbitmap_dirty_bits,
+ .write_all = llbitmap_write_all,
+
+ /* extra sysfs attributes */
+ .group = &md_llbitmap_group,
+};
+
+/*
+ * Create the two llbitmap workqueues and register llbitmap as an md submodule.
+ *
+ * Return: 0 on success, -ENOMEM if a workqueue cannot be allocated, or the
+ * error from register_md_submodule(). Nothing stays allocated on failure.
+ */
+int md_llbitmap_init(void)
+{
+	int ret;
+
+	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
+					    WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!md_llbitmap_io_wq)
+		return -ENOMEM;
+
+	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
+						WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!md_llbitmap_unplug_wq) {
+		ret = -ENOMEM;
+		goto err_destroy_io_wq;
+	}
+
+	ret = register_md_submodule(&llbitmap_ops.head);
+	if (ret)
+		goto err_destroy_unplug_wq;
+
+	return 0;
+
+err_destroy_unplug_wq:
+	destroy_workqueue(md_llbitmap_unplug_wq);
+	md_llbitmap_unplug_wq = NULL;
+err_destroy_io_wq:
+	destroy_workqueue(md_llbitmap_io_wq);
+	md_llbitmap_io_wq = NULL;
+	return ret;
+}
+
+/*
+ * Undo md_llbitmap_init() in reverse order: unregister the submodule first so
+ * that it can no longer be looked up, then destroy the workqueues it uses.
+ */
+void md_llbitmap_exit(void)
+{
+	unregister_md_submodule(&llbitmap_ops.head);
+	destroy_workqueue(md_llbitmap_unplug_wq);
+	md_llbitmap_unplug_wq = NULL;
+	destroy_workqueue(md_llbitmap_io_wq);
+	md_llbitmap_io_wq = NULL;
+}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e033c26fdd4..41c476b40c7a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq;
* workqueue whith reconfig_mutex grabbed.
*/
static struct workqueue_struct *md_misc_wq;
-struct workqueue_struct *md_bitmap_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
@@ -677,8 +676,64 @@ static void active_io_release(struct percpu_ref *ref)
static void no_op(struct percpu_ref *r) {}
+/*
+ * Look up the bitmap implementation selected by mddev->bitmap_id and install
+ * it as mddev->bitmap_ops. For non-dm arrays the implementation's optional
+ * sysfs group is created as well; a failure to create it is only logged.
+ *
+ * Returns true if bitmap_id is ID_BITMAP_NONE, if the matching ops are already
+ * installed, or if the lookup succeeded; false if the id is unknown or does
+ * not name a bitmap submodule.
+ */
+static bool mddev_set_bitmap_ops(struct mddev *mddev)
+{
+	struct bitmap_operations *old = mddev->bitmap_ops;
+	struct md_submodule_head *head;
+
+	if (mddev->bitmap_id == ID_BITMAP_NONE)
+		return true;
+	if (old && old->head.id == mddev->bitmap_id)
+		return true;
+
+	xa_lock(&md_submodule);
+	head = xa_load(&md_submodule, mddev->bitmap_id);
+
+	if (!head) {
+		pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id);
+		xa_unlock(&md_submodule);
+		return false;
+	}
+
+	if (head->type != MD_BITMAP) {
+		pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id);
+		xa_unlock(&md_submodule);
+		return false;
+	}
+
+	mddev->bitmap_ops = (void *)head;
+	xa_unlock(&md_submodule);
+
+	if (mddev_is_dm(mddev) || !mddev->bitmap_ops->group)
+		return true;
+
+	if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) {
+		pr_warn("md: cannot register extra bitmap attributes for %s\n",
+			mdname(mddev));
+		return true;
+	}
+
+	/* Inform user with KOBJ_CHANGE about new bitmap attributes. */
+	kobject_uevent(&mddev->kobj, KOBJ_CHANGE);
+	return true;
+}
+
+/*
+ * Remove the sysfs group of the installed bitmap ops (non-dm arrays only, and
+ * only if the ops provide one), then clear mddev->bitmap_ops.
+ */
+static void mddev_clear_bitmap_ops(struct mddev *mddev)
+{
+	if (!mddev_is_dm(mddev)) {
+		struct bitmap_operations *ops = mddev->bitmap_ops;
+
+		if (ops && ops->group)
+			sysfs_remove_group(&mddev->kobj, ops->group);
+	}
+
+	mddev->bitmap_ops = NULL;
+}
+
int mddev_init(struct mddev *mddev)
{
+ if (!IS_ENABLED(CONFIG_MD_BITMAP))
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ else
+ mddev->bitmap_id = ID_BITMAP;
if (percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
@@ -713,7 +768,6 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
- mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
@@ -1020,15 +1074,26 @@ static void super_written(struct bio *bio)
wake_up(&mddev->sb_wait);
}
-void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page)
+/**
+ * md_write_metadata - write metadata to underlying disk, including
+ * array superblock, badblocks, bitmap superblock and bitmap bits.
+ * @mddev: the array to write
+ * @rdev: the underlying disk to write
+ * @sector: the offset to @rdev
+ * @size: the length of the metadata
+ * @page: the metadata
+ * @offset: the offset to @page
+ *
+ * Write @size bytes of @page, starting from @offset, to @sector of @rdev.
+ * mddev->pending_writes is incremented before returning and decremented on
+ * completion, waking up sb_wait. The caller must call md_super_wait() after
+ * issuing IO to all rdevs. If an error occurs, md_error() is called and the
+ * @rdev is kicked out of @mddev.
+ */
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset)
{
- /* write first size bytes of page to sector of rdev
- * Increment mddev->pending_writes before returning
- * and decrement it on completion, waking up sb_wait
- * if zero is reached.
- * If an error occurred, call md_error
- */
struct bio *bio;
if (!page)
@@ -1046,7 +1111,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector;
- __bio_add_page(bio, page, size, 0);
+ __bio_add_page(bio, page, size, offset);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
@@ -1356,6 +1421,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev)
struct md_bitmap_stats stats;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return 0;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return 0;
@@ -1653,8 +1721,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
}
@@ -2302,8 +2370,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
sb->super_offset = cpu_to_le64(rdev->sb_start);
sb->sb_csum = calc_sb_1_csum(sb);
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
@@ -2313,13 +2381,15 @@ static int
super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
+ struct mddev *mddev = rdev->mddev;
+
/* All necessary checks on new >= old have been done */
if (new_offset >= rdev->data_offset)
return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */
- if (rdev->mddev->minor_version == 0)
+ if (mddev->minor_version == 0)
return 1;
/* otherwise we must be sure not to step on
@@ -2331,8 +2401,7 @@ super_1_allow_new_offset(struct md_rdev *rdev,
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
- if (!rdev->mddev->bitmap_info.file) {
- struct mddev *mddev = rdev->mddev;
+ if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) {
struct md_bitmap_stats stats;
int err;
@@ -2804,24 +2873,24 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
- mddev->bitmap_ops->update_sb(mddev->bitmap);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
if (!test_bit(Faulty, &rdev->flags)) {
- md_super_write(mddev,rdev,
- rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
pr_debug("md: (write) %pg's sb offset: %llu\n",
rdev->bdev,
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
if (rdev->badblocks.size) {
- md_super_write(mddev, rdev,
- rdev->badblocks.sector,
- rdev->badblocks.size << 9,
- rdev->bb_page);
+ md_write_metadata(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page, 0);
rdev->badblocks.size = 0;
}
@@ -4150,6 +4219,86 @@ static struct md_sysfs_entry md_new_level =
__ATTR(new_level, 0664, new_level_show, new_level_store);
static ssize_t
+bitmap_type_show(struct mddev *mddev, char *page)
+{
+ struct md_submodule_head *head;
+ unsigned long i;
+ ssize_t len = 0;
+
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ len += sprintf(page + len, "[none] ");
+ else
+ len += sprintf(page + len, "none ");
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_BITMAP)
+ continue;
+
+ if (mddev->bitmap_id == head->id)
+ len += sprintf(page + len, "[%s] ", head->name);
+ else
+ len += sprintf(page + len, "%s ", head->name);
+ }
+ xa_unlock(&md_submodule);
+
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+static ssize_t
+bitmap_type_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct md_submodule_head *head;
+ enum md_submodule_id id;
+ unsigned long i;
+ int err = 0;
+
+ xa_lock(&md_submodule);
+
+ if (mddev->bitmap_ops) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (cmd_match(buf, "none")) {
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ goto out;
+ }
+
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type == MD_BITMAP && cmd_match(buf, head->name)) {
+ mddev->bitmap_id = head->id;
+ goto out;
+ }
+ }
+
+ err = kstrtoint(buf, 10, &id);
+ if (err)
+ goto out;
+
+ if (id == ID_BITMAP_NONE) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ head = xa_load(&md_submodule, id);
+ if (head && head->type == MD_BITMAP) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ err = -ENOENT;
+
+out:
+ xa_unlock(&md_submodule);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_bitmap_type =
+__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store);
+
+static ssize_t
layout_show(struct mddev *mddev, char *page)
{
/* just a number, not meaningful for all levels */
@@ -4680,6 +4829,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
unsigned long chunk, end_chunk;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return len;
+
err = mddev_lock(mddev);
if (err)
return err;
@@ -5752,6 +5904,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_new_level.attr,
+ &md_bitmap_type.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -5801,7 +5954,6 @@ static const struct attribute_group md_redundancy_group = {
static const struct attribute_group *md_attr_groups[] = {
&md_default_group,
- &md_bitmap_group,
NULL,
};
@@ -6133,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+static int md_bitmap_create(struct mddev *mddev)
+{
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ return -EINVAL;
+
+ if (!mddev_set_bitmap_ops(mddev))
+ return -ENOENT;
+
+ return mddev->bitmap_ops->create(mddev);
+}
+
+static void md_bitmap_destroy(struct mddev *mddev)
+{
+ if (!md_bitmap_registered(mddev))
+ return;
+
+ mddev->bitmap_ops->destroy(mddev);
+ mddev_clear_bitmap_ops(mddev);
+}
+
int md_run(struct mddev *mddev)
{
int err;
@@ -6299,7 +6471,7 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- err = mddev->bitmap_ops->create(mddev);
+ err = md_bitmap_create(mddev);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
@@ -6372,7 +6544,7 @@ bitmap_abort:
pers->free(mddev, mddev->private);
mddev->private = NULL;
put_pers(pers);
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
@@ -6392,10 +6564,12 @@ int do_md_run(struct mddev *mddev)
if (err)
goto out;
- err = mddev->bitmap_ops->load(mddev);
- if (err) {
- mddev->bitmap_ops->destroy(mddev);
- goto out;
+ if (md_bitmap_registered(mddev)) {
+ err = mddev->bitmap_ops->load(mddev);
+ if (err) {
+ md_bitmap_destroy(mddev);
+ goto out;
+ }
}
if (mddev_is_clustered(mddev))
@@ -6546,7 +6720,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0);
}
- mddev->bitmap_ops->flush(mddev);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6573,7 +6748,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- mddev->bitmap_ops->wait_behind_writes(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6589,7 +6765,7 @@ static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -7307,6 +7483,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
{
int err = 0;
+ if (!md_bitmap_registered(mddev))
+ return -EINVAL;
+
if (mddev->pers) {
if (!mddev->pers->quiesce || !mddev->thread)
return -EBUSY;
@@ -7363,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- err = mddev->bitmap_ops->create(mddev);
+ err = md_bitmap_create(mddev);
if (!err)
err = mddev->bitmap_ops->load(mddev);
if (err) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
}
}
@@ -7679,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- rv = mddev->bitmap_ops->create(mddev);
+ rv = md_bitmap_create(mddev);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
} else {
struct md_bitmap_stats stats;
@@ -7710,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
put_cluster_ops(mddev);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7747,9 +7926,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
*/
-static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mddev *mddev = bdev->bd_disk->private_data;
+ struct mddev *mddev = disk->private_data;
geo->heads = 2;
geo->sectors = 4;
@@ -8491,6 +8670,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
unsigned long chunk_kb;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return;
@@ -8873,18 +9055,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio);
static void md_bitmap_start(struct mddev *mddev,
struct md_io_clone *md_io_clone)
{
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->start_discard :
+ mddev->bitmap_ops->start_write;
+
if (mddev->pers->bitmap_sector)
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
&md_io_clone->sectors);
- mddev->bitmap_ops->start_write(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
}
static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
{
- mddev->bitmap_ops->end_write(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->end_discard :
+ mddev->bitmap_ops->end_write;
+
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
}
static void md_end_clone_io(struct bio *bio)
@@ -8893,7 +9081,7 @@ static void md_end_clone_io(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
- if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
md_bitmap_end(mddev, md_io_clone);
if (bio->bi_status && !orig_bio->bi_status)
@@ -8920,9 +9108,10 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
- if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) {
md_io_clone->offset = (*bio)->bi_iter.bi_sector;
md_io_clone->sectors = bio_sectors(*bio);
+ md_io_clone->rw = op_stat_group(bio_op(*bio));
md_bitmap_start(mddev, md_io_clone);
}
@@ -8944,7 +9133,7 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
- if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
md_bitmap_end(mddev, md_io_clone);
if (bio->bi_status && !orig_bio->bi_status)
@@ -9010,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev,
}
}
+/*
+ * If lazy recovery is requested and all rdevs are in sync, select the rdev with
+ * the highest index to perform recovery to build initial xor data; this is the
+ * same as the old bitmap.
+ */
+static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
+{
+ struct md_rdev *recover_rdev = NULL;
+ struct md_rdev *rdev;
+ bool ret = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->raid_disk < 0)
+ continue;
+
+ if (test_bit(Faulty, &rdev->flags) ||
+ !test_bit(In_sync, &rdev->flags))
+ break;
+
+ if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
+ recover_rdev = rdev;
+ }
+
+ if (recover_rdev) {
+ clear_bit(In_sync, &recover_rdev->flags);
+ ret = true;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
{
sector_t start = 0;
@@ -9041,6 +9263,14 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
start = rdev->recovery_offset;
rcu_read_unlock();
+ /*
+ * If there are no spares and raid456 lazy initial recovery is
+ * requested, select an rdev to recover and start from sector 0.
+ */
+ if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
+ start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
+ start = 0;
+
/* If there is a bitmap, we need to make sure all
* writes that started before we added a spare
* complete before we start doing a recovery.
@@ -9061,19 +9291,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
static bool sync_io_within_limit(struct mddev *mddev)
{
- int io_sectors;
-
/*
* For raid456, sync IO is stripe(4k) per IO, for other levels, it's
* RESYNC_PAGES(64k) per IO.
*/
- if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
- io_sectors = 8;
- else
- io_sectors = 128;
-
return atomic_read(&mddev->recovery_active) <
- io_sectors * sync_io_depth(mddev);
+ (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
}
#define SYNC_MARKS 10
@@ -9283,6 +9506,12 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+ if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) {
+ sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j);
+ if (sectors)
+ goto update;
+ }
+
sectors = mddev->pers->sync_request(mddev, j, max_sectors,
&skipped);
if (sectors == 0) {
@@ -9298,6 +9527,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+update:
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
@@ -9607,6 +9837,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
return true;
}
@@ -9615,6 +9846,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
remove_spares(mddev, NULL);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
return true;
}
@@ -9624,7 +9856,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
* re-add.
*/
*spares = remove_and_add_spares(mddev, NULL);
- if (*spares) {
+ if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -9682,7 +9914,7 @@ static void md_start_sync(struct work_struct *ws)
* We are adding a device or devices to an array which has the bitmap
* stored on all devices. So make sure all bitmap pages get written.
*/
- if (spares)
+ if (spares && md_bitmap_enabled(mddev, true))
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
@@ -9770,7 +10002,7 @@ static void unregister_sync_thread(struct mddev *mddev)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (mddev->bitmap)
+ if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
@@ -9837,6 +10069,7 @@ void md_check_recovery(struct mddev *mddev)
}
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
@@ -9947,6 +10180,7 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
/*
* We call mddev->cluster_ops->update_size here because sync_size could
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
@@ -10094,8 +10328,16 @@ static void md_geninit(void)
static int __init md_init(void)
{
- int ret = -ENOMEM;
+ int ret = md_bitmap_init();
+
+ if (ret)
+ return ret;
+ ret = md_llbitmap_init();
+ if (ret)
+ goto err_bitmap;
+
+ ret = -ENOMEM;
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
goto err_wq;
@@ -10104,11 +10346,6 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
- md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
- 0);
- if (!md_bitmap_wq)
- goto err_bitmap_wq;
-
ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
goto err_md;
@@ -10127,12 +10364,13 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
- destroy_workqueue(md_bitmap_wq);
-err_bitmap_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
err_wq:
+ md_llbitmap_exit();
+err_bitmap:
+ md_bitmap_exit();
return ret;
}
@@ -10150,7 +10388,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
if (ret)
pr_info("md-cluster: resize failed\n");
- else
+ else if (md_bitmap_enabled(mddev, false))
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
@@ -10438,8 +10676,8 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
- destroy_workqueue(md_bitmap_wq);
destroy_workqueue(md_wq);
+ md_bitmap_exit();
}
subsys_initcall(md_init);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 51af29a03079..1979c2d4fe89 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -26,7 +26,7 @@
enum md_submodule_type {
MD_PERSONALITY = 0,
MD_CLUSTER,
- MD_BITMAP, /* TODO */
+ MD_BITMAP,
};
enum md_submodule_id {
@@ -38,8 +38,9 @@ enum md_submodule_id {
ID_RAID6 = 6,
ID_RAID10 = 10,
ID_CLUSTER,
- ID_BITMAP, /* TODO */
- ID_LLBITMAP, /* TODO */
+ ID_BITMAP,
+ ID_LLBITMAP,
+ ID_BITMAP_NONE,
};
struct md_submodule_head {
@@ -565,6 +566,7 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
+ enum md_submodule_id bitmap_id;
void *bitmap; /* the bitmap for the device */
struct bitmap_operations *bitmap_ops;
struct {
@@ -665,6 +667,8 @@ enum recovery_flags {
MD_RECOVERY_RESHAPE,
/* remote node is running resync thread */
MD_RESYNCING_REMOTE,
+ /* raid456 lazy initial recover */
+ MD_RECOVERY_LAZY_RECOVER,
};
enum md_ro_state {
@@ -796,7 +800,6 @@ struct md_sysfs_entry {
ssize_t (*show)(struct mddev *, char *);
ssize_t (*store)(struct mddev *, const char *, size_t);
};
-extern const struct attribute_group md_bitmap_group;
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
@@ -873,6 +876,7 @@ struct md_io_clone {
unsigned long start_time;
sector_t offset;
unsigned long sectors;
+ enum stat_group rw;
struct bio bio_clone;
};
@@ -909,8 +913,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
-extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page);
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset);
extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, blk_opf_t opf, bool metadata_op);
@@ -1013,7 +1018,6 @@ struct mdu_array_info_s;
struct mdu_disk_info_s;
extern int mdp_major;
-extern struct workqueue_struct *md_bitmap_wq;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
@@ -1034,6 +1038,12 @@ static inline bool mddev_is_dm(struct mddev *mddev)
return !mddev->gendisk;
}
+static inline bool raid_is_456(struct mddev *mddev)
+{
+ return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 ||
+ mddev->level == ID_RAID6;
+}
+
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
sector_t sector)
{
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 419139ad7663..e443e478645a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -464,21 +464,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
- struct bio *split = bio_split(bio,
- zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
- &mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio = bio_submit_split_bioset(bio,
+ zone->zone_end - bio->bi_iter.bi_sector,
+ &mddev->bio_set);
+ if (!bio)
return;
- }
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
end = zone->zone_end;
- } else
+ } else {
end = bio_end_sector(bio);
+ }
orig_end = end;
if (zone != conf->strip_zone)
@@ -613,17 +608,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
: sector_div(sector, chunk_sects));
if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ bio = bio_submit_split_bioset(bio, sectors,
&mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ if (!bio)
return true;
- }
- bio_chain(split, bio);
- raid0_map_submit_bio(mddev, bio);
- bio = split;
}
raid0_map_submit_bio(mddev, bio);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 52881e6032da..521625756128 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
- if (!mddev->bitmap_ops->enabled(mddev)) {
+ if (!md_bitmap_enabled(mddev, true)) {
raid1_submit_write(bio);
return true;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d30b82beeb92..592a40233004 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r1_bio->bios[j] = bio;
}
/*
@@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct raid1_info *mirror;
struct bio *read_bio;
int max_sectors;
- int rdisk, error;
+ int rdisk;
bool r1bio_existed = !!r1_bio;
/*
@@ -1366,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
- if (test_bit(WriteMostly, &mirror->rdev->flags)) {
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ md_bitmap_enabled(mddev, false)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
@@ -1376,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1413,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
err_handle:
atomic_dec(&mirror->rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -1452,12 +1448,36 @@ retry:
return true;
}
+static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio,
+ struct bio *bio)
+{
+ unsigned long max_write_behind = mddev->bitmap_info.max_write_behind;
+ struct md_bitmap_stats stats;
+ int err;
+
+ /* behind writes rely on the bitmap, see bitmap_operations */
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return;
+
+ /* Don't do behind IO if reader is waiting, or there are too many. */
+ if (!stats.behind_wait && stats.behind_writes < max_write_behind)
+ alloc_behind_master_bio(r1_bio, bio);
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
+
+}
+
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int max_write_sectors)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- int i, disks, k, error;
+ int i, disks, k;
unsigned long flags;
int first_clone;
int max_sectors;
@@ -1561,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* complexity of supporting that is not worth
* the benefit.
*/
- if (bio->bi_opf & REQ_ATOMIC) {
- error = -EIO;
+ if (bio->bi_opf & REQ_ATOMIC)
goto err_handle;
- }
good_sectors = first_bad - r1_bio->sector;
if (good_sectors < max_sectors)
@@ -1584,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1612,22 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
- unsigned long max_write_behind =
- mddev->bitmap_info.max_write_behind;
- struct md_bitmap_stats stats;
- int err;
-
- /* do behind I/O ?
- * Not if there are too many, or cannot
- * allocate memory, or a reader on WriteMostly
- * is waiting for behind writes to flush */
- err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
- if (!err && write_behind && !stats.behind_wait &&
- stats.behind_writes < max_write_behind)
- alloc_behind_master_bio(r1_bio, bio);
-
- if (test_bit(R1BIO_BehindIO, &r1_bio->state))
- mddev->bitmap_ops->start_behind_write(mddev);
+ if (write_behind)
+ raid1_start_write_behind(mddev, r1_bio, bio);
first_clone = 0;
}
@@ -1683,8 +1684,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -2057,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
- mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
+ md_bitmap_end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2804,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current addess in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2829,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call the bitmap_start_sync doesn't actually record anything
*/
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2846,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
-
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
- mddev_is_clustered(mddev) &&
- (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS >
+ conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -3004,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks, still_degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr,
+ &sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3325,15 +3326,17 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
- int ret;
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index d236ef179cfb..2ebe35aaa534 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -178,7 +178,9 @@ enum r1bio_state {
* any write was successful. Otherwise we call when
* any write-behind write succeeds, otherwise we call
* with failure when last write completes (and all failed).
- * Record that bi_end_io was called with this flag...
+ *
+ * And for bio_split errors, record that bi_end_io was called
+ * with this flag...
*/
R1BIO_Returned,
/* If a write for this request means we can clear some
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9832eefb2f15..14dcd5142eb4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].bio = bio;
if (!conf->have_replacement)
continue;
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].repl_bio = bio;
}
/*
@@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
struct bio *bio = r10_bio->master_bio;
struct r10conf *conf = r10_bio->mddev->private;
- if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- bio->bi_status = BLK_STS_IOERR;
+ if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) {
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
- bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -1154,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
int slot = r10_bio->read_slot;
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
- int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
@@ -1203,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
rdev->bdev,
(unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
}
@@ -1241,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
return;
err_handle:
atomic_dec(&rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1351,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int i, k;
sector_t sectors;
int max_sectors;
- int error;
if ((mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE,
@@ -1465,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* complexity of supporting that is not worth
* the benefit.
*/
- if (bio->bi_opf & REQ_ATOMIC) {
- error = -EIO;
+ if (bio->bi_opf & REQ_ATOMIC)
goto err_handle;
- }
good_sectors = first_bad - dev_sector;
if (good_sectors < max_sectors)
@@ -1489,17 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->sectors = max_sectors;
if (r10_bio->sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, r10_bio->sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
}
@@ -1531,8 +1523,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1679,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the first split part */
submit_bio_noacct(split);
@@ -1694,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the second split part */
submit_bio_noacct(bio);
@@ -3221,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- mddev->bitmap_ops->end_sync(mddev,
- mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
- mddev->bitmap_ops->end_sync(mddev, sect,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, sect, &sync_blocks);
}
} else {
/* completed sync */
@@ -3249,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3351,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks,
- true);
+ must_sync = md_bitmap_start_sync(mddev, sect,
+ &sync_blocks, true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3396,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks, still_degraded);
-
+ md_bitmap_start_sync(mddev, sect, &sync_blocks,
+ still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
int k;
@@ -3570,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks,
- mddev->degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks,
+ mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -4226,7 +4217,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
- int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4240,9 +4230,12 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > size)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, size, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
@@ -4508,8 +4501,9 @@ static int raid10_start_reshape(struct mddev *mddev)
oldsize = raid10_size(mddev, 0, 0);
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
- if (!mddev_is_clustered(mddev)) {
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ if (!mddev_is_clustered(mddev) &&
+ md_bitmap_enabled(mddev, false)) {
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
else
@@ -4531,13 +4525,14 @@ static int raid10_start_reshape(struct mddev *mddev)
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ /* cluster can't be setup without bitmap */
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
- mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
+ mddev->bitmap_ops->resize(mddev, oldsize, 0);
goto abort;
}
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3f16ad6904a9..da00a55f7a55 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -165,6 +165,8 @@ enum r10bio_state {
* so that raid10d knows what to do with them.
*/
R10BIO_ReadError,
+/* For bio_split errors, record that bi_end_io was called. */
+ R10BIO_Returned,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e385ef1355e8..24b32a0c95b4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
int disks)
{
int rmw = 0, rcw = 0, i;
- sector_t resync_offset = conf->mddev->resync_offset;
+ struct mddev *mddev = conf->mddev;
+ sector_t resync_offset = mddev->resync_offset;
/* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or
@@ -4116,6 +4117,12 @@ static int handle_stripe_dirtying(struct r5conf *conf,
pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
conf->rmw_level, (unsigned long long)resync_offset,
(unsigned long long)sh->sector);
+ } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
+ !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) {
+ /* The initial recover is not done, must read everything */
+ rcw = 1; rmw = 2;
+ pr_debug("force RCW by lazy recovery, sh->sector=%llu\n",
+ sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
@@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+ mddev_add_trace_msg(mddev, "raid5 rmw %llu %d",
sh->sector, rmw);
for (i = disks; i--; ) {
@@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
- if (rcw && !mddev_is_dm(conf->mddev))
- blk_add_trace_msg(conf->mddev->gendisk->queue,
+ if (rcw && !mddev_is_dm(mddev))
+ blk_add_trace_msg(mddev->gendisk->queue,
"raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
@@ -4698,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
- /* in sync if before recovery_offset */
- set_bit(R5_Insync, &dev->flags);
- else if (test_bit(R5_UPTODATE, &dev->flags) &&
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <=
+ rdev->recovery_offset) {
+ /*
+ * in sync if:
+ * - normal IO, or
+ * - resync IO that is not lazy recovery
+ *
+ * For lazy recovery, we have to mark the rdev without
+ * In_sync as failed, to build initial xor data.
+ */
+ if (!test_bit(STRIPE_SYNCING, &sh->state) ||
+ !test_bit(MD_RECOVERY_LAZY_RECOVER,
+ &conf->mddev->recovery))
+ set_bit(R5_Insync, &dev->flags);
+ } else if (test_bit(R5_UPTODATE, &dev->flags) &&
test_bit(R5_Expanded, &dev->flags))
/* If we've reshaped into here, we assume it is Insync.
* We will shortly update recovery_offset to make
@@ -5468,17 +5486,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
- struct bio *split;
sector_t sector = raid_bio->bi_iter.bi_sector;
unsigned chunk_sects = mddev->chunk_sectors;
unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
if (sectors < bio_sectors(raid_bio)) {
struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
- bio_chain(split, raid_bio);
- submit_bio_noacct(raid_bio);
- raid_bio = split;
+
+ raid_bio = bio_submit_split_bioset(raid_bio, sectors,
+ &conf->bio_split);
+ if (!raid_bio)
+ return NULL;
}
if (!raid5_read_one_chunk(mddev, raid_bio))
@@ -6492,11 +6510,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
@@ -6525,8 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- true) &&
+ !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6535,7 +6553,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6557,9 +6576,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
still_degraded = true;
}
- mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- still_degraded);
-
+ md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6763,7 +6780,8 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- mddev->bitmap_ops->unplug(mddev, true);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -8313,7 +8331,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
- int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8323,9 +8340,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, sectors, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index d34892782f6e..1af157ce0a63 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb)
msb->card = NULL;
}
-static int msb_bd_getgeo(struct block_device *bdev,
+static int msb_bd_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct msb_data *msb = bdev->bd_disk->private_data;
+ struct msb_data *msb = disk->private_data;
*geo = msb->geometry;
return 0;
}
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c9853d887d28..075519caa547 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk)
kfree(msb);
}
-static int mspro_block_bd_getgeo(struct block_device *bdev,
+static int mspro_block_bd_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct mspro_block_data *msb = bdev->bd_disk->private_data;
+ struct mspro_block_data *msb = disk->private_data;
geo->heads = msb->heads;
geo->sectors = msb->sectors_per_track;
diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c
index 3a64dc7a7e27..3304f8824cf7 100644
--- a/drivers/message/fusion/mptscsih.c
+++ b/drivers/message/fusion/mptscsih.c
@@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf,
* This is anyones guess quite frankly.
*/
int
-mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev,
+mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads;
diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h
index 8c2bb2331fc1..f9678d48100c 100644
--- a/drivers/message/fusion/mptscsih.h
+++ b/drivers/message/fusion/mptscsih.h
@@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt);
extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt);
extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt);
extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt);
-extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]);
+extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]);
extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index a74e75df93b0..9399bf6c766a 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -439,9 +439,9 @@ static void mmc_blk_release(struct gendisk *disk)
}
static int
-mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16);
+ geo->cylinders = get_capacity(disk) / (4 * 16);
geo->heads = 4;
geo->sectors = 16;
return 0;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 847c11542f02..28e09d080440 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -246,9 +246,9 @@ unlock:
blktrans_dev_put(dev);
}
-static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int blktrans_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
+ struct mtd_blktrans_dev *dev = disk->private_data;
int ret = -ENXIO;
mutex_lock(&dev->lock);
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 39cc0a6a4d37..b53fd147fa65 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd)
mutex_unlock(&dev->dev_mutex);
}
-static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* Some tools might require this information */
geo->heads = 1;
geo->cylinders = 1;
- geo->sectors = get_capacity(bdev->bd_disk);
+ geo->sectors = get_capacity(disk);
geo->start = 0;
return 0;
}
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 2a1aa32e6693..a933db961ed7 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio)
bio_endio(bio);
}
-static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
+static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* some standard values */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
return 0;
}
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 91e273b89fea..1f51fbebd9fa 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -684,6 +684,59 @@ out_free_enc:
EXPORT_SYMBOL_GPL(nvme_auth_generate_digest);
/**
+ * hkdf_expand_label - HKDF-Expand-Label (RFC 8446 section 7.1)
+ * @hmac_tfm: hash context keyed with pseudorandom key
+ * @label: ASCII label without "tls13 " prefix
+ * @labellen: length of @label
+ * @context: context bytes
+ * @contextlen: length of @context
+ * @okm: output keying material
+ * @okmlen: length of @okm
+ *
+ * Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand().
+ *
+ * Returns 0 on success with output keying material stored in @okm,
+ * or a negative errno value otherwise.
+ */
+static int hkdf_expand_label(struct crypto_shash *hmac_tfm,
+ const u8 *label, unsigned int labellen,
+ const u8 *context, unsigned int contextlen,
+ u8 *okm, unsigned int okmlen)
+{
+ int err;
+ u8 *info;
+ unsigned int infolen;
+ const char *tls13_prefix = "tls13 ";
+ unsigned int prefixlen = strlen(tls13_prefix);
+
+ if (WARN_ON(labellen > (255 - prefixlen)))
+ return -EINVAL;
+ if (WARN_ON(contextlen > 255))
+ return -EINVAL;
+
+ infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen);
+ info = kzalloc(infolen, GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ /* HkdfLabel.Length */
+ put_unaligned_be16(okmlen, info);
+
+ /* HkdfLabel.Label */
+ info[2] = prefixlen + labellen;
+ memcpy(info + 3, tls13_prefix, prefixlen);
+ memcpy(info + 3 + prefixlen, label, labellen);
+
+ /* HkdfLabel.Context */
+ info[3 + prefixlen + labellen] = contextlen;
+ memcpy(info + 4 + prefixlen + labellen, context, contextlen);
+
+ err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen);
+ kfree_sensitive(info);
+ return err;
+}
+
+/**
* nvme_auth_derive_tls_psk - Derive TLS PSK
* @hmac_id: Hash function identifier
* @psk: generated input PSK
@@ -715,10 +768,10 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
{
struct crypto_shash *hmac_tfm;
const char *hmac_name;
- const char *psk_prefix = "tls13 nvme-tls-psk";
+ const char *label = "nvme-tls-psk";
static const char default_salt[HKDF_MAX_HASHLEN];
- size_t info_len, prk_len;
- char *info;
+ size_t prk_len;
+ const char *ctx;
unsigned char *prk, *tls_key;
int ret;
@@ -758,36 +811,29 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
if (ret)
goto out_free_prk;
- /*
- * 2 additional bytes for the length field from HDKF-Expand-Label,
- * 2 additional bytes for the HMAC ID, and one byte for the space
- * separator.
- */
- info_len = strlen(psk_digest) + strlen(psk_prefix) + 5;
- info = kzalloc(info_len + 1, GFP_KERNEL);
- if (!info) {
+ ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest);
+ if (!ctx) {
ret = -ENOMEM;
goto out_free_prk;
}
- put_unaligned_be16(psk_len, info);
- memcpy(info + 2, psk_prefix, strlen(psk_prefix));
- sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest);
-
tls_key = kzalloc(psk_len, GFP_KERNEL);
if (!tls_key) {
ret = -ENOMEM;
- goto out_free_info;
+ goto out_free_ctx;
}
- ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len);
+ ret = hkdf_expand_label(hmac_tfm,
+ label, strlen(label),
+ ctx, strlen(ctx),
+ tls_key, psk_len);
if (ret) {
kfree(tls_key);
- goto out_free_info;
+ goto out_free_ctx;
}
*ret_psk = tls_key;
-out_free_info:
- kfree(info);
+out_free_ctx:
+ kfree(ctx);
out_free_prk:
kfree(prk);
out_free_shash:
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 201fc8809a62..012fcfc79a73 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -331,9 +331,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
} else {
memset(chap->c2, 0, chap->hash_len);
}
- if (ctrl->opts->concat)
+ if (ctrl->opts->concat) {
chap->s2 = 0;
- else
+ chap->bi_directional = false;
+ } else
chap->s2 = nvme_auth_get_seqnum();
data->seqnum = cpu_to_le32(chap->s2);
if (chap->host_key_len) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6b7493934535..fa4181d7de73 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1807,12 +1807,12 @@ static void nvme_release(struct gendisk *disk)
nvme_ns_release(disk->private_data);
}
-int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* some standard values */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
return 0;
}
@@ -3167,6 +3167,11 @@ static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
return ctrl->cntrltype == NVME_CTRL_ADMIN;
}
+static inline bool nvme_is_io_ctrl(struct nvme_ctrl *ctrl)
+{
+ return !nvme_discovery_ctrl(ctrl) && !nvme_admin_ctrl(ctrl);
+}
+
static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
@@ -3369,7 +3374,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
else
ctrl->max_zeroes_sectors = 0;
- if (ctrl->subsys->subtype != NVME_NQN_NVME ||
+ if (!nvme_is_io_ctrl(ctrl) ||
!nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
return 0;
@@ -3491,14 +3496,14 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL;
}
- if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
+ if (nvme_is_io_ctrl(ctrl) && ctrl->ioccsz < 4) {
dev_err(ctrl->device,
"I/O queue command capsule supported size %d < 4\n",
ctrl->ioccsz);
return -EINVAL;
}
- if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
+ if (nvme_is_io_ctrl(ctrl) && ctrl->iorcsz < 1) {
dev_err(ctrl->device,
"I/O queue response capsule supported size %d < 1\n",
ctrl->iorcsz);
@@ -4990,8 +4995,14 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
* checking that they started once before, hence are reconnecting back.
*/
if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
- nvme_discovery_ctrl(ctrl))
+ nvme_discovery_ctrl(ctrl)) {
+ if (!ctrl->kato) {
+ nvme_stop_keep_alive(ctrl);
+ ctrl->kato = NVME_DEFAULT_KATO;
+ nvme_start_keep_alive(ctrl);
+ }
nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
+ }
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 3e12d4683ac7..03987f497a5b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3032,11 +3032,17 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
++ctrl->ctrl.nr_reconnects;
- if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+ spin_lock_irqsave(&ctrl->rport->lock, flags);
+ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) {
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
return -ENODEV;
+ }
- if (nvme_fc_ctlr_active_on_rport(ctrl))
+ if (nvme_fc_ctlr_active_on_rport(ctrl)) {
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
return -ENOTUNIQ;
+ }
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: create association : host wwpn 0x%016llx "
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e28bb9113f64..c212fa952c0f 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -142,14 +142,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
0, rq_data_dir(req));
-
if (ret)
return ret;
- bio = req->bio;
- if (bdev)
- bio_set_dev(bio, bdev);
-
if (has_metadata) {
ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
if (ret)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cfd2b5b90b91..102fae6a231c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id);
-int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_attr_groups[];
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2c6d9506b172..c916176bd9f0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -172,9 +172,7 @@ struct nvme_dev {
u32 last_ps;
bool hmb;
struct sg_table *hmb_sgt;
-
mempool_t *dmavec_mempool;
- mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
@@ -261,6 +259,9 @@ enum nvme_iod_flags {
/* single segment dma mapping */
IOD_SINGLE_SEGMENT = 1U << 2,
+
+ /* Metadata using non-coalesced MPTR */
+ IOD_SINGLE_META_SEGMENT = 1U << 5,
};
struct nvme_dma_vec {
@@ -284,7 +285,8 @@ struct nvme_iod {
unsigned int nr_dma_vecs;
dma_addr_t meta_dma;
- struct sg_table meta_sgt;
+ unsigned int meta_total_len;
+ struct dma_iova_state meta_dma_state;
struct nvme_sgl_desc *meta_descriptor;
};
@@ -641,6 +643,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
return nvmeq->descriptor_pools.large;
}
+static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd)
+{
+ return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG;
+}
+
static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
{
return cmd->common.flags &
@@ -690,25 +697,52 @@ static void nvme_free_prps(struct request *req)
mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
}
-static void nvme_free_sgls(struct request *req)
+static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
+ struct nvme_sgl_desc *sg_list)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int len = le32_to_cpu(sge->length);
struct device *dma_dev = nvmeq->dev->dev;
- dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
- unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
- struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ unsigned int i;
+
+ if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
+ dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
+ return;
+ }
+
+ for (i = 0; i < len / sizeof(*sg_list); i++)
+ dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir);
+}
+
+static void nvme_unmap_metadata(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
enum dma_data_direction dir = rq_dma_dir(req);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct device *dma_dev = nvmeq->dev->dev;
+ struct nvme_sgl_desc *sge = iod->meta_descriptor;
- if (iod->nr_descriptors) {
- unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+ if (iod->flags & IOD_SINGLE_META_SEGMENT) {
+ dma_unmap_page(dma_dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
+ }
- for (i = 0; i < nr_entries; i++)
- dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
- le32_to_cpu(sg_list[i].length), dir);
- } else {
- dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+ if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+ iod->meta_total_len)) {
+ if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
+ nvme_free_sgls(req, sge, &sge[1]);
+ else
+ dma_unmap_page(dma_dev, iod->meta_dma,
+ iod->meta_total_len, dir);
}
+
+ if (iod->meta_descriptor)
+ dma_pool_free(nvmeq->descriptor_pools.small,
+ iod->meta_descriptor, iod->meta_dma);
}
static void nvme_unmap_data(struct request *req)
@@ -727,7 +761,8 @@ static void nvme_unmap_data(struct request *req)
if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
if (nvme_pci_cmd_use_sgl(&iod->cmd))
- nvme_free_sgls(req);
+ nvme_free_sgls(req, &iod->cmd.common.dptr.sgl, /* sge: descriptor in the command */
+ iod->descriptors[0]); /* sg_list: array walked when sge is not a data descriptor */
else
nvme_free_prps(req);
}
@@ -1007,70 +1042,70 @@ static blk_status_t nvme_map_data(struct request *req)
return nvme_pci_setup_data_prp(req, &iter);
}
-static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
-{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
- sge->type = NVME_SGL_FMT_DATA_DESC << 4;
-}
-
static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
+ unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_dev *dev = nvmeq->dev;
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sgl, *sg;
- unsigned int entries;
+ struct blk_dma_iter iter;
dma_addr_t sgl_dma;
- int rc, i;
+ int i = 0;
- iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
- if (!iod->meta_sgt.sgl)
- return BLK_STS_RESOURCE;
+ if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+ &iod->meta_dma_state, &iter))
+ return iter.status;
- sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
- iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
- iod->meta_sgt.sgl);
- if (!iod->meta_sgt.orig_nents)
- goto out_free_sg;
+ if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+ entries = 1;
- rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc)
- goto out_free_sg;
+ /*
+ * The NVMe MPTR descriptor has an implicit length that the host and
+ * device must agree on to avoid data/memory corruption. We trust the
+ * kernel allocated correctly based on the format's parameters, so use
+ * the more efficient MPTR to avoid extra dma pool allocations for the
+ * SGL indirection.
+ *
+ * But for user commands, we don't necessarily know what they do, so
+ * the driver can't validate the metadata buffer size. The SGL
+ * descriptor provides an explicit length, so we're relying on that
+ * mechanism to catch any misunderstandings between the application and
+ * device.
+ */
+ if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
+ iod->cmd.common.metadata = cpu_to_le64(iter.addr);
+ iod->meta_total_len = iter.len;
+ iod->meta_dma = iter.addr;
+ iod->meta_descriptor = NULL;
+ return BLK_STS_OK;
+ }
sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
&sgl_dma);
if (!sg_list)
- goto out_unmap_sg;
+ return BLK_STS_RESOURCE;
- entries = iod->meta_sgt.nents;
iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
-
iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
-
- sgl = iod->meta_sgt.sgl;
if (entries == 1) {
- nvme_pci_sgl_set_data_sg(sg_list, sgl);
+ iod->meta_total_len = iter.len;
+ nvme_pci_sgl_set_data(sg_list, &iter);
return BLK_STS_OK;
}
sgl_dma += sizeof(*sg_list);
- nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
- for_each_sg(sgl, sg, entries, i)
- nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
-
- return BLK_STS_OK;
+ do {
+ nvme_pci_sgl_set_data(&sg_list[++i], &iter);
+ iod->meta_total_len += iter.len;
+ } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
- return BLK_STS_RESOURCE;
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
+ if (unlikely(iter.status))
+ nvme_unmap_metadata(req);
+ return iter.status;
}
static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
@@ -1083,6 +1118,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
+ iod->flags |= IOD_SINGLE_META_SEGMENT;
return BLK_STS_OK;
}
@@ -1104,7 +1140,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
iod->flags = 0;
iod->nr_descriptors = 0;
iod->total_len = 0;
- iod->meta_sgt.nents = 0;
+ iod->meta_total_len = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
@@ -1215,25 +1251,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
*rqlist = requeue_list;
}
-static __always_inline void nvme_unmap_metadata(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
-
- if (!iod->meta_sgt.nents) {
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len,
- rq_dma_dir(req));
- return;
- }
-
- dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
- iod->meta_dma);
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-}
-
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
if (blk_integrity_rq(req))
@@ -3039,7 +3056,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
- size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
dev->dmavec_mempool = mempool_create_node(1,
@@ -3048,17 +3064,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->dmavec_mempool)
return -ENOMEM;
-
- dev->iod_meta_mempool = mempool_create_node(1,
- mempool_kmalloc, mempool_kfree,
- (void *)meta_size, GFP_KERNEL,
- dev_to_node(dev->dev));
- if (!dev->iod_meta_mempool)
- goto free;
return 0;
-free:
- mempool_destroy(dev->dmavec_mempool);
- return -ENOMEM;
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3324,10 +3330,12 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
* Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND
* because of high power consumption (> 2 Watt) in s2idle
* sleep. Only some boards with Intel CPU are affected.
+ * (Note for testing: Samsung 990 Evo Plus has same PCI ID)
*/
if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") ||
dmi_match(DMI_BOARD_NAME, "GMxPXxx") ||
dmi_match(DMI_BOARD_NAME, "GXxMRXx") ||
+ dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
dmi_match(DMI_BOARD_NAME, "PH4PG31") ||
dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") ||
dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71"))
@@ -3508,7 +3516,6 @@ out_disable:
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->dmavec_mempool);
- mempool_destroy(dev->iod_meta_mempool);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@@ -3572,7 +3579,6 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->dmavec_mempool);
- mempool_destroy(dev->iod_meta_mempool);
nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c0fe8cfb7229..1413788ca7d5 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2250,6 +2250,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_cleanup_tagset;
+ if (ctrl->opts->concat && !ctrl->tls_pskid)
+ return 0;
+
error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 0dd7bd99afa3..5d7d483bfbe3 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -513,9 +513,6 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
return 0;
}
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
struct nvmet_ns *ns)
{
@@ -523,6 +520,8 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
struct pci_dev *p2p_dev;
int ret;
+ lockdep_assert_held(&ctrl->subsys->lock);
+
if (!ctrl->p2p_client || !ns->use_p2pmem)
return;
@@ -1539,15 +1538,14 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
return false;
}
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
struct device *p2p_client)
{
struct nvmet_ns *ns;
unsigned long idx;
+ lockdep_assert_held(&ctrl->subsys->lock);
+
if (!p2p_client)
return;
@@ -1557,14 +1555,13 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
{
struct radix_tree_iter iter;
void __rcu **slot;
+ lockdep_assert_held(&ctrl->subsys->lock);
+
radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
pci_dev_put(radix_tree_deref_slot(slot));
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index a9b18c051f5b..7d84527d5a43 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -54,6 +54,8 @@ struct nvmet_fc_ls_req_op { /* for an LS RQST XMT */
int ls_error;
struct list_head lsreq_list; /* tgtport->ls_req_list */
bool req_queued;
+
+ struct work_struct put_work;
};
@@ -111,8 +113,6 @@ struct nvmet_fc_tgtport {
struct nvmet_fc_port_entry *pe;
struct kref ref;
u32 max_sg_cnt;
-
- struct work_struct put_work;
};
struct nvmet_fc_port_entry {
@@ -235,12 +235,13 @@ static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue);
static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport);
-static void nvmet_fc_put_tgtport_work(struct work_struct *work)
+static void nvmet_fc_put_lsop_work(struct work_struct *work)
{
- struct nvmet_fc_tgtport *tgtport =
- container_of(work, struct nvmet_fc_tgtport, put_work);
+ struct nvmet_fc_ls_req_op *lsop =
+ container_of(work, struct nvmet_fc_ls_req_op, put_work);
- nvmet_fc_tgtport_put(tgtport);
+ nvmet_fc_tgtport_put(lsop->tgtport);
+ kfree(lsop);
}
static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport);
static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
@@ -367,7 +368,7 @@ __nvmet_fc_finish_ls_req(struct nvmet_fc_ls_req_op *lsop)
DMA_BIDIRECTIONAL);
out_putwork:
- queue_work(nvmet_wq, &tgtport->put_work);
+ queue_work(nvmet_wq, &lsop->put_work);
}
static int
@@ -388,6 +389,7 @@ __nvmet_fc_send_ls_req(struct nvmet_fc_tgtport *tgtport,
lsreq->done = done;
lsop->req_queued = false;
INIT_LIST_HEAD(&lsop->lsreq_list);
+ INIT_WORK(&lsop->put_work, nvmet_fc_put_lsop_work);
lsreq->rqstdma = fc_dma_map_single(tgtport->dev, lsreq->rqstaddr,
lsreq->rqstlen + lsreq->rsplen,
@@ -447,8 +449,6 @@ nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
__nvmet_fc_finish_ls_req(lsop);
/* fc-nvme target doesn't care about success or failure of cmd */
-
- kfree(lsop);
}
/*
@@ -1075,6 +1075,14 @@ nvmet_fc_delete_assoc_work(struct work_struct *work)
static void
nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc)
{
+ int terminating;
+
+ terminating = atomic_xchg(&assoc->terminating, 1);
+
+ /* if already terminating, do nothing */
+ if (terminating)
+ return;
+
nvmet_fc_tgtport_get(assoc->tgtport);
if (!queue_work(nvmet_wq, &assoc->del_work))
nvmet_fc_tgtport_put(assoc->tgtport);
@@ -1202,13 +1210,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc)
{
struct nvmet_fc_tgtport *tgtport = assoc->tgtport;
unsigned long flags;
- int i, terminating;
-
- terminating = atomic_xchg(&assoc->terminating, 1);
-
- /* if already terminating, do nothing */
- if (terminating)
- return;
+ int i;
spin_lock_irqsave(&tgtport->lock, flags);
list_del_rcu(&assoc->a_list);
@@ -1410,7 +1412,6 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
kref_init(&newrec->ref);
ida_init(&newrec->assoc_cnt);
newrec->max_sg_cnt = template->max_sgl_segments;
- INIT_WORK(&newrec->put_work, nvmet_fc_put_tgtport_work);
ret = nvmet_fc_alloc_ls_iodlist(newrec);
if (ret) {
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 257b497d515a..5dffcc5becae 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -496,13 +496,15 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
if (!targetport) {
/*
* The target port is gone. The target doesn't expect any
- * response anymore and the ->done call is not valid
- * because the resources have been freed by
- * nvmet_fc_free_pending_reqs.
+ * response anymore and thus lsreq can't be accessed anymore.
*
* We end up here from delete association exchange:
* nvmet_fc_xmt_disconnect_assoc sends an async request.
+ *
+ * Return success because this is what LLDDs do; silently
+ * drop the response.
*/
+ lsrsp->done(lsrsp);
kmem_cache_free(lsreq_cache, tls_req);
return 0;
}
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 506a947d00a5..7765e40f7cea 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -334,6 +334,11 @@ static int dasd_state_basic_to_ready(struct dasd_device *device)
lim.max_dev_sectors = device->discipline->max_sectors(block);
lim.max_hw_sectors = lim.max_dev_sectors;
lim.logical_block_size = block->bp_block;
+ /*
+ * Adjust dma_alignment to match block_size - 1
+ * to ensure proper buffer alignment checks in the block layer.
+ */
+ lim.dma_alignment = lim.logical_block_size - 1;
if (device->discipline->has_discard) {
unsigned int max_bytes;
@@ -3114,12 +3119,14 @@ static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx,
PTR_ERR(cqr) == -ENOMEM ||
PTR_ERR(cqr) == -EAGAIN) {
rc = BLK_STS_RESOURCE;
- goto out;
+ } else if (PTR_ERR(cqr) == -EINVAL) {
+ rc = BLK_STS_INVAL;
+ } else {
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "CCW creation failed (rc=%ld) on request %p",
+ PTR_ERR(cqr), req);
+ rc = BLK_STS_IOERR;
}
- DBF_DEV_EVENT(DBF_ERR, basedev,
- "CCW creation failed (rc=%ld) on request %p",
- PTR_ERR(cqr), req);
- rc = BLK_STS_IOERR;
goto out;
}
/*
@@ -3317,11 +3324,11 @@ static void dasd_release(struct gendisk *disk)
/*
* Return disk geometry.
*/
-static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
struct dasd_device *base;
- base = dasd_device_from_gendisk(bdev->bd_disk);
+ base = dasd_device_from_gendisk(disk);
if (!base)
return -ENODEV;
@@ -3331,7 +3338,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return -EINVAL;
}
base->discipline->fill_geometry(base->block, geo);
- geo->start = get_start_sect(bdev) >> base->block->s2b_shift;
+ // geo->start is left unchanged by the above
+ geo->start >>= base->block->s2b_shift;
dasd_put_device(base);
return 0;
}
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 883d4a12a172..a377a6f6900a 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1695,7 +1695,7 @@ out:
} /* End twa_reset_sequence() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[])
+static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c
index 8d4174c7107e..e319be7d369c 100644
--- a/drivers/scsi/3w-sas.c
+++ b/drivers/scsi/3w-sas.c
@@ -1404,7 +1404,7 @@ out:
} /* End twl_reset_device_extension() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[])
+static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[])
{
int heads, sectors;
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 89bd56f78ef9..0306a228c702 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev)
} /* End tw_reset_device_extension() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index 82597bd96525..a86d780d1ba4 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3240,7 +3240,7 @@ static int blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset)
the BIOS, and a warning may be displayed.
*/
-static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev,
+static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *params)
{
struct blogic_adapter *adapter =
@@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev,
diskparam->sectors = 32;
}
diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors);
- buf = scsi_bios_ptable(dev);
+ buf = scsi_bios_ptable(disk);
if (buf == NULL)
return 0;
/*
diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h
index 61bf26d4fc10..79de815e33b0 100644
--- a/drivers/scsi/BusLogic.h
+++ b/drivers/scsi/BusLogic.h
@@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets,
static const char *blogic_drvr_info(struct Scsi_Host *);
static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *);
-static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *);
+static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *);
static int blogic_sdev_configure(struct scsi_device *,
struct queue_limits *lim);
static void blogic_qcompleted_ccb(struct blogic_ccb *);
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 4b12e6dd8f07..ea66196ef7c7 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype)
/**
* aac_biosparm - return BIOS parameters for disk
* @sdev: The scsi device corresponding to the disk
- * @bdev: the block device corresponding to the disk
+ * @disk: the gendisk corresponding to the disk
* @capacity: the sector capacity of the disk
* @geom: geometry block to fill in
*
@@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype)
* be displayed.
*/
-static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev,
+static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *geom)
{
struct diskparm *param = (struct diskparm *)geom;
@@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev,
* entry whose end_head matches one of the standard geometry
* translations ( 64/32, 128/32, 255/63 ).
*/
- buf = scsi_bios_ptable(bdev);
+ buf = scsi_bios_ptable(disk);
if (!buf)
return 0;
if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) {
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 3a2c336307c0..063e1b5818d3 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp)
* ip[2]: cylinders
*/
static int
-advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
struct asc_board *boardp = shost_priv(sdev->host);
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index e94c0a19c435..182aa80ec4c6 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt)
* Return the "logical geometry"
*
*/
-static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *info_array)
{
struct Scsi_Host *shpnt = sdev->host;
@@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev
int info[3];
/* try to figure out the geometry from the partition table */
- if (scsicam_bios_param(bdev, capacity, info) < 0 ||
+ if (scsicam_bios_param(disk, capacity, info) < 0 ||
!((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) {
if (EXT_TRANS) {
printk(KERN_NOTICE
diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c
index 389499d3e00a..371e8300f029 100644
--- a/drivers/scsi/aha1542.c
+++ b/drivers/scsi/aha1542.c
@@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd)
}
static int aha1542_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int geom[])
+ struct gendisk *unused, sector_t capacity, int geom[])
{
struct aha1542_hostdata *aha1542 = shost_priv(sdev->host);
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be7ebbbb9ba8..b234621f6b37 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level,
}
static int aha1740_biosparam(struct scsi_device *sdev,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int* ip)
{
int size = capacity;
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 17dfc3c72110..c3d1b9dd24ae 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* Return the disk geometry for the given SCSI device.
*/
static int
-ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
int heads;
@@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
ahd = *((struct ahd_softc **)sdev->host->hostdata);
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
heads = 64;
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index cebf8c5d0caf..8b2b98666d61 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* Return the disk geometry for the given SCSI device.
*/
static int
-ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
int heads;
@@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
ahc = *((struct ahc_softc **)sdev->host->hostdata);
channel = sdev_channel(sdev);
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
heads = 64;
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index fb57343a97bd..f0c5a30ce51b 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb);
static int arcmsr_abort(struct scsi_cmnd *);
static int arcmsr_bus_reset(struct scsi_cmnd *);
static int arcmsr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int *info);
+ struct gendisk *disk, sector_t capacity, int *info);
static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd);
static int arcmsr_probe(struct pci_dev *pdev,
const struct pci_device_id *id);
@@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id)
}
static int arcmsr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int *geom)
+ struct gendisk *disk, sector_t capacity, int *geom)
{
int heads, sectors, cylinders, total_capacity;
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
total_capacity = capacity;
diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c
index 401242912855..df6f40b51deb 100644
--- a/drivers/scsi/atp870u.c
+++ b/drivers/scsi/atp870u.c
@@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr)
}
-static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev,
+static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused,
sector_t capacity, int *ip)
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c
index 504c4e0c5d17..c0b2a980db34 100644
--- a/drivers/scsi/fdomain.c
+++ b/drivers/scsi/fdomain.c
@@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd)
}
static int fdomain_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity,
+ struct gendisk *disk, sector_t capacity,
int geom[])
{
- unsigned char *p = scsi_bios_ptable(bdev);
+ unsigned char *p = scsi_bios_ptable(disk);
if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */
&& p[4]) { /* Partition type */
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 0821cf994b98..5c602c057798 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand)
* be done in sd.c. Even if it gets fixed there, this will still
* work.
*/
-static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
ip[0] = 0x40;
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index 8648bd965287..ed34ad92c807 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd)
/**
* i91u_biosparam - return the "logical geometry"
* @sdev: SCSI device
- * @dev: Matching block device
+ * @unused: Matching gendisk
* @capacity: Sector size of drive
* @info_array: Return space for BIOS geometry
*
@@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd)
* FIXME: limited to 2^32 sector devices.
*/
-static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int *info_array)
{
struct initio_host *host; /* Point to Host adapter control block */
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d06b79f03538..dd6754db7e4c 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev);
/**
* ipr_biosparam - Return the HSC mapping
- * @sdev: scsi device struct
- * @block_device: block device pointer
+ * @sdev: scsi device struct
+ * @unused: gendisk pointer
* @capacity: capacity of the device
- * @parm: Array containing returned HSC values.
+ * @parm: Array containing returned HSC values.
*
* This function generates the HSC parms that fdisk uses.
* We want to make sure we return something that places partitions
@@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev);
* 0 on success
**/
static int ipr_biosparam(struct scsi_device *sdev,
- struct block_device *block_device,
+ struct gendisk *unused,
sector_t capacity, int *parm)
{
int heads, sectors;
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 94adb6ac02a4..3393a288fd23 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue)
/* Set bios geometry for the controller */
/* */
/****************************************************************************/
-static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata;
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index 8ac932ec4444..30a4d4a580e9 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -398,7 +398,7 @@
/*
* Scsi_Host Template
*/
- static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[]);
static int ips_sdev_configure(struct scsi_device *SDptr,
struct queue_limits *lim);
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 928723c90b75..ffa5b49aaf08 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth)
EXPORT_SYMBOL_GPL(sas_change_queue_depth);
int sas_bios_param(struct scsi_device *scsi_dev,
- struct block_device *bdev,
+ struct gendisk *unused,
sector_t capacity, int *hsc)
{
hsc[0] = 255;
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 2006094af418..a00622c0c526 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare
* Return the disk geometry for a particular disk
*/
static int
-megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
adapter_t *adapter;
@@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev,
geom[2] = cylinders;
}
else {
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
dev_info(&adapter->dev->dev,
diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h
index 013fbfb911b9..d6bfd26a8843 100644
--- a/drivers/scsi/megaraid.h
+++ b/drivers/scsi/megaraid.h
@@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *);
static int megaraid_abort(struct scsi_cmnd *);
static int megaraid_reset(struct scsi_cmnd *);
static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int);
-static int megaraid_biosparam(struct scsi_device *, struct block_device *,
+static int megaraid_biosparam(struct scsi_device *, struct gendisk *,
sector_t, int []);
static int mega_build_sglist (adapter_t *adapter, scb_t *scb,
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 615e06fd4ee8..abbbc4b36cd1 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd)
/**
* megasas_bios_param - Returns disk geometry for a disk
* @sdev: device handle
- * @bdev: block device
+ * @unused: gendisk
* @capacity: drive capacity
* @geom: geometry parameters
*/
static int
-megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads;
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index e467b56949e9..3df52a3b435b 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -4031,7 +4031,7 @@ out:
/**
* mpi3mr_bios_param - BIOS param callback
* @sdev: SCSI device reference
- * @bdev: Block device reference
+ * @unused: gendisk reference
* @capacity: Capacity in logical sectors
* @params: Parameter array
*
@@ -4040,7 +4040,7 @@ out:
* Return: 0 always
*/
static int mpi3mr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int params[])
+ struct gendisk *unused, sector_t capacity, int params[])
{
int heads;
int sectors;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 967af259118e..7092d0debef3 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
/**
* scsih_bios_param - fetch head, sector, cylinder info for a disk
* @sdev: scsi device struct
- * @bdev: pointer to block device context
+ * @unused: pointer to gendisk
* @capacity: device size (in 512 byte sectors)
* @params: three element array to place output:
* params[0] number of heads (max 255)
@@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* params[2] number of cylinders
*/
static int
-scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int params[])
{
int heads;
diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c
index 96549e7f5705..bdc2f2f17753 100644
--- a/drivers/scsi/mvumi.c
+++ b/drivers/scsi/mvumi.c
@@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd)
}
static int
-mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors;
diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c
index 486db5b2f05d..b8453c0333dc 100644
--- a/drivers/scsi/myrb.c
+++ b/drivers/scsi/myrb.c
@@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev)
kfree(sdev->hostdata);
}
-static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
struct myrb_hba *cb = shost_priv(sdev->host);
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 278c78d066c4..a3b505240351 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt)
static int
SYM53C500_biosparm(struct scsi_device *disk,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int *info_array)
{
int size;
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index 1ed3171f1797..ea682f3044b6 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -845,7 +845,7 @@ static DEF_SCSI_QCMD(ppa_queuecommand)
* be done in sd.c. Even if it gets fixed there, this will still
* work.
*/
-static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
ip[0] = 0x40;
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 6af018f1ca22..ef841f643171 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd)
}
static int
-qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c
index 3e065d5fc80c..1ce469b7db99 100644
--- a/drivers/scsi/qlogicfas408.c
+++ b/drivers/scsi/qlogicfas408.c
@@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand)
* Return bios parameters
*/
-int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev,
+int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused,
sector_t capacity, int ip[])
{
/* This should mimic the DOS Qlogic driver's behavior exactly */
diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h
index a971db11d293..83ef86c71f2f 100644
--- a/drivers/scsi/qlogicfas408.h
+++ b/drivers/scsi/qlogicfas408.h
@@ -106,7 +106,7 @@ struct qlogicfas408_priv {
irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id);
int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd);
int qlogicfas408_biosparam(struct scsi_device * disk,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int ip[]);
int qlogicfas408_abort(struct scsi_cmnd * cmd);
extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd);
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index 19e6c3852d50..887de505bcf9 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -30,9 +30,9 @@
* starting at offset %0x1be.
* Returns: partition table in kmalloc(GFP_KERNEL) memory, or NULL on error.
*/
-unsigned char *scsi_bios_ptable(struct block_device *dev)
+unsigned char *scsi_bios_ptable(struct gendisk *dev)
{
- struct address_space *mapping = bdev_whole(dev)->bd_mapping;
+ struct address_space *mapping = dev->part0->bd_mapping;
unsigned char *res = NULL;
struct folio *folio;
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(scsi_bios_ptable);
/**
* scsi_partsize - Parse cylinders/heads/sectors from PC partition table
- * @bdev: block device to parse
+ * @disk: gendisk of the disk to parse
* @capacity: size of the disk in sectors
* @geom: output in form of [hds, cylinders, sectors]
*
@@ -57,7 +57,7 @@ EXPORT_SYMBOL(scsi_bios_ptable);
*
* Returns: %false on failure, %true on success.
*/
-bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3])
+bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3])
{
int cyl, ext_cyl, end_head, end_cyl, end_sector;
unsigned int logical_end, physical_end, ext_physical_end;
@@ -65,7 +65,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3])
void *buf;
int ret = false;
- buf = scsi_bios_ptable(bdev);
+ buf = scsi_bios_ptable(disk);
if (!buf)
return false;
@@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds
/**
* scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors.
- * @bdev: which device
+ * @disk: which device
* @capacity: size of the disk in sectors
* @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders
*
@@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds
*
* Returns : -1 on failure, 0 on success.
*/
-int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip)
+int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip)
{
u64 capacity64 = capacity; /* Suppress gcc warning */
int ret = 0;
/* try to infer mapping from partition table */
- if (scsi_partsize(bdev, capacity, ip))
+ if (scsi_partsize(disk, capacity, ip))
return 0;
if (capacity64 < (1ULL << 32)) {
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5b8668accf8e..00ad574ce61c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk)
scsi_device_put(sdev);
}
-static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
+ struct scsi_disk *sdkp = scsi_disk(disk);
struct scsi_device *sdp = sdkp->device;
struct Scsi_Host *host = sdp->host;
sector_t capacity = logical_to_sectors(sdp, sdkp->capacity);
@@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
/* override with calculated, extended default, or driver values */
if (host->hostt->bios_param)
- host->hostt->bios_param(sdp, bdev, capacity, diskinfo);
+ host->hostt->bios_param(sdp, disk, capacity, diskinfo);
else
- scsicam_bios_param(bdev, capacity, diskinfo);
+ scsicam_bios_param(disk, capacity, diskinfo);
geo->heads = diskinfo[0];
geo->sectors = diskinfo[1];
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index 63ed7f9aaa93..d8ad02c29320 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work)
}
static int stex_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int geom[])
+ struct gendisk *unused, sector_t capacity, int geom[])
{
int heads = 255, sectors = 63;
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index d9e59204a9c3..dc51ea352198 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice,
return 0;
}
-static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev,
+static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int *info)
{
sector_t nsect = capacity;
diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c
index 5a380eecfc75..0c9987828774 100644
--- a/drivers/scsi/wd719x.c
+++ b/drivers/scsi/wd719x.c
@@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd)
return wd719x_chip_init(wd) == 0 ? SUCCESS : FAILED;
}
-static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
if (capacity >= 0x200000) {
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index f991cf759836..db4e09042469 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -861,7 +861,7 @@ new_bio:
bio = bio_kmalloc(nr_vecs, GFP_KERNEL);
if (!bio)
goto fail;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs,
+ bio_init_inline(bio, NULL, nr_vecs,
rw ? REQ_OP_WRITE : REQ_OP_READ);
bio->bi_end_io = pscsi_bi_endio;