190 files changed, 4534 insertions, 1826 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0ddffc9133d0..0ed10aeff86b 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -603,16 +603,10 @@ Date: July 2003 Contact: linux-block@vger.kernel.org Description: [RW] This controls how many requests may be allocated in the - block layer for read or write requests. Note that the total - allocated number may be twice this amount, since it applies only - to reads or writes (not the accumulated sum). - - To avoid priority inversion through request starvation, a - request queue maintains a separate request pool per each cgroup - when CONFIG_BLK_CGROUP is enabled, and this parameter applies to - each such per-block-cgroup request pool. IOW, if there are N - block cgroups, each request queue may have up to N request - pools, each independently regulated by nr_requests. + block layer. Note that this value only represents the quantity for a + single blk_mq_tags instance. The actual number for the entire + device depends on the hardware queue count, whether an elevator is + enabled, and whether tags are shared. What: /sys/block/<disk>/queue/nr_zones diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 4ff2cc291d18..1c2eacc94758 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -347,6 +347,54 @@ All md devices contain: active-idle like active, but no writes have been seen for a while (safe_mode_delay). + consistency_policy + This indicates how the array maintains consistency in case of unexpected + shutdown. It can be: + + none + Array has no redundancy information, e.g. raid0, linear. + + resync + Full resync is performed and all redundancy is regenerated when the + array is started after unclean shutdown. + + bitmap + Resync assisted by a write-intent bitmap. + + journal + For raid4/5/6, journal device is used to log transactions and replay + after unclean shutdown. + + ppl + For raid5 only, Partial Parity Log is used to close the write hole and + eliminate resync. + + The accepted values when writing to this file are ``ppl`` and ``resync``, + used to enable and disable PPL. + + uuid + This indicates the UUID of the array in the following format: + xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + + bitmap_type + [RW] When read, this file will display the current and available + bitmaps for this array. The currently active bitmap will be enclosed + in [] brackets. Writing a bitmap name or ID to this file will switch + control of this array to that new bitmap. Note that writing a new + bitmap for a created array is forbidden. + + none + No bitmap + bitmap + The default internal bitmap + llbitmap + The lockless internal bitmap + +If bitmap_type is not none, then additional bitmap attributes bitmap/xxx or +llbitmap/xxx will be created after the md device KOBJ_CHANGE event. + +If bitmap_type is bitmap, then the md device will also contain: + bitmap/location This indicates where the write-intent bitmap for the array is stored. @@ -401,35 +449,23 @@ All md devices contain: once the array becomes non-degraded, and this fact has been recorded in the metadata. - consistency_policy - This indicates how the array maintains consistency in case of unexpected - shutdown. It can be: - - none - Array has no redundancy information, e.g. raid0, linear. - - resync - Full resync is performed and all redundancy is regenerated when the - array is started after unclean shutdown. - - bitmap - Resync assisted by a write-intent bitmap.
+If bitmap_type is llbitmap, then the md device will also contain: - journal - For raid4/5/6, journal device is used to log transactions and replay - after unclean shutdown. + llbitmap/bits + This is read-only; it shows the status of the bitmap bits and the + count of each value. - ppl - For raid5 only, Partial Parity Log is used to close the write hole and - eliminate resync. - - The accepted values when writing to this file are ``ppl`` and ``resync``, - used to enable and disable PPL. + llbitmap/metadata + This is read-only; it shows the bitmap metadata, including chunksize, + chunkshift, chunks, offset and daemon_sleep. - uuid - This indicates the UUID of the array in the following format: - xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + llbitmap/daemon_sleep + This is read-write; the time in seconds after which the daemon + function is triggered to clear dirty bits. + llbitmap/barrier_idle + This is read-write; the time in seconds after which the page barrier + is considered idle, meaning dirty bits in the page will be cleared. As component devices are added to an md array, they appear in the ``md`` directory as new directories named:: diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa287ccdac2f..77704fde9845 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -443,7 +443,7 @@ prototypes:: int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 3ac4c7fafb55..634f5c28a849 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -380,7 +380,7 @@ Details:: /** * scsi_bios_ptable - return copy of block device's partition table - * @dev: pointer to block device + * @dev: pointer to gendisk * * Returns pointer to partition table, or NULL for failure * * @@ -390,7 +390,7 @@ Details:: * * Defined in: drivers/scsi/scsicam.c **/ - unsigned char *scsi_bios_ptable(struct block_device *dev) + unsigned char *scsi_bios_ptable(struct gendisk *dev) /** @@ -623,7 +623,7 @@ Details:: * bios_param - fetch head, sector, cylinder info for a disk * @sdev: pointer to scsi device context (defined in * include/scsi/scsi_device.h) - * @bdev: pointer to block device context (defined in fs.h) + * @disk: pointer to gendisk (defined in blkdev.h) * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -643,7 +643,7 @@ Details:: * * Optionally defined in: LLD **/ - int bios_param(struct scsi_device * sdev, struct block_device *bdev, + int bios_param(struct scsi_device * sdev, struct gendisk *disk, sector_t capacity, int params[3]) diff --git a/MAINTAINERS b/MAINTAINERS index c1befdd7f681..0554bf05b426 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4382,7 +4382,7 @@ W: https://rust-for-linux.com B: https://github.com/Rust-for-Linux/linux/issues C: https://rust-for-linux.zulipchat.com/#narrow/stream/Block T: git https://github.com/Rust-for-Linux/linux.git rust-block-next -F: drivers/block/rnull.rs +F: drivers/block/rnull/ F: rust/kernel/block.rs F: rust/kernel/block/ diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h index 64b42d9591fc..5a6239e65097 100644 ---
a/arch/alpha/include/asm/floppy.h +++ b/arch/alpha/include/asm/floppy.h @@ -90,25 +90,6 @@ static int FDC2 = -1; #define N_FDC 2 #define N_DRIVE 8 -/* - * Most Alphas have no problems with floppy DMA crossing 64k borders, - * except for certain ones, like XL and RUFFIAN. - * - * However, the test is simple and fast, and this *is* floppy, after all, - * so we do it for all platforms, just to make sure. - * - * This is advantageous in other circumstances as well, as in moving - * about the PCI DMA windows and forcing the floppy to start doing - * scatter-gather when it never had before, and there *is* a problem - * on that platform... ;-} - */ - -static inline unsigned long CROSS_64KB(void *a, unsigned long s) -{ - unsigned long p = (unsigned long)a; - return ((p + s - 1) ^ p) & ~0xffffUL; -} - #define EXTRA_FLOPPY_PARAMS #endif /* __ASM_ALPHA_FLOPPY_H */ diff --git a/arch/arm/include/asm/floppy.h b/arch/arm/include/asm/floppy.h index e1cb04ed5008..e579f77162e9 100644 --- a/arch/arm/include/asm/floppy.h +++ b/arch/arm/include/asm/floppy.h @@ -65,8 +65,6 @@ static unsigned char floppy_selects[4] = { 0x10, 0x21, 0x23, 0x33 }; #define N_FDC 1 #define N_DRIVE 4 -#define CROSS_64KB(a,s) (0) - /* * This allows people to reverse the order of * fd0 and fd1, in case their hardware is diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 83410f8184ec..94a4fadc651a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -77,9 +77,9 @@ static void nfhd_submit_bio(struct bio *bio) bio_endio(bio); } -static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nfhd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct nfhd_device *dev = bdev->bd_disk->private_data; + struct nfhd_device *dev = disk->private_data; geo->cylinders = dev->blocks >> (6 - dev->bshift); geo->heads = 4; diff --git a/arch/m68k/include/asm/floppy.h b/arch/m68k/include/asm/floppy.h index a4d0fea47c6b..dea98bbc0932 100644 --- a/arch/m68k/include/asm/floppy.h +++ b/arch/m68k/include/asm/floppy.h @@ -107,13 +107,9 @@ static void fd_free_irq(void) #define fd_free_dma() /* nothing */ -/* No 64k boundary crossing problems on Q40 - no DMA at all */ -#define CROSS_64KB(a,s) (0) - #define DMA_MODE_READ 0x44 /* i386 look-alike */ #define DMA_MODE_WRITE 0x48 - static int m68k_floppy_init(void) { use_virtual_dma =1; diff --git a/arch/mips/include/asm/floppy.h b/arch/mips/include/asm/floppy.h index 021d09ae5670..44da2ff91f65 100644 --- a/arch/mips/include/asm/floppy.h +++ b/arch/mips/include/asm/floppy.h @@ -34,21 +34,6 @@ static inline void fd_cacheflush(char * addr, long size) #define N_FDC 1 /* do you *really* want a second controller? */ #define N_DRIVE 8 -/* - * The DMA channel used by the floppy controller cannot access data at - * addresses >= 16MB - * - * Went back to the 1MB limit, as some people had problems with the floppy - * driver otherwise. It doesn't matter much for performance anyway, as most - * floppy accesses go through the track buffer. - * - * On MIPSes using vdma, this actually means that *all* transfers go thru - * the * track buffer since 0x1000000 is always smaller than KSEG0/1. - * Actually this needs to be a bit more complicated since the so much different - * hardware available with MIPS CPUs ... 
- */ -#define CROSS_64KB(a, s) ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64) - #define EXTRA_FLOPPY_PARAMS #include <floppy.h> diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index b318a7df52f6..f15b69fea901 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -8,9 +8,9 @@ #ifndef __ASM_PARISC_FLOPPY_H #define __ASM_PARISC_FLOPPY_H +#include <linux/sizes.h> #include <linux/vmalloc.h> - /* * The DMA channel used by the floppy controller cannot access data at * addresses >= 16MB @@ -20,15 +20,12 @@ * floppy accesses go through the track buffer. */ #define _CROSS_64KB(a,s,vdma) \ -(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) - -#define CROSS_64KB(a,s) _CROSS_64KB(a,s,use_virtual_dma & 1) - + (!(vdma) && \ + ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K)) #define SW fd_routine[use_virtual_dma&1] #define CSW fd_routine[can_use_virtual_dma & 1] - #define fd_inb(base, reg) readb((base) + (reg)) #define fd_outb(value, base, reg) writeb(value, (base) + (reg)) @@ -206,7 +203,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) { #ifdef FLOPPY_SANITY_CHECK - if (CROSS_64KB(addr, size)) { + if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) { printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); return -1; } diff --git a/arch/powerpc/include/asm/floppy.h b/arch/powerpc/include/asm/floppy.h index 34abf8bea2cc..f4dc657638b3 100644 --- a/arch/powerpc/include/asm/floppy.h +++ b/arch/powerpc/include/asm/floppy.h @@ -206,11 +206,6 @@ static int FDC2 = -1; #define N_FDC 2 /* Don't change this! */ #define N_DRIVE 8 -/* - * The PowerPC has no problems with floppy DMA crossing 64k borders. - */ -#define CROSS_64KB(a,s) (0) - #define EXTRA_FLOPPY_PARAMS #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/floppy_32.h b/arch/sparc/include/asm/floppy_32.h index 836f6575aa1d..7251d1fed7a4 100644 --- a/arch/sparc/include/asm/floppy_32.h +++ b/arch/sparc/include/asm/floppy_32.h @@ -96,9 +96,6 @@ static struct sun_floppy_ops sun_fdops; #define N_FDC 1 #define N_DRIVE 8 -/* No 64k boundary crossing problems on the Sparc. */ -#define CROSS_64KB(a,s) (0) - /* Routines unique to each controller type on a Sun. */ static void sun_set_dor(unsigned char value, int fdc_82077) { diff --git a/arch/sparc/include/asm/floppy_64.h b/arch/sparc/include/asm/floppy_64.h index b0f633ce3518..135f9a49b6ba 100644 --- a/arch/sparc/include/asm/floppy_64.h +++ b/arch/sparc/include/asm/floppy_64.h @@ -95,9 +95,6 @@ static int sun_floppy_types[2] = { 0, 0 }; #define N_FDC 1 #define N_DRIVE 8 -/* No 64k boundary crossing problems on the Sparc. 
*/ -#define CROSS_64KB(a,s) (0) - static unsigned char sun_82077_fd_inb(unsigned long base, unsigned int reg) { udelay(5); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 4de6613e7468..f2b2feeeb455 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,7 +108,7 @@ static DEFINE_MUTEX(ubd_lock); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo); #define MAX_DEV (16) @@ -1324,9 +1324,9 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, return res; } -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct ubd *ubd_dev = bdev->bd_disk->private_data; + struct ubd *ubd_dev = disk->private_data; geo->heads = 128; geo->sectors = 32; diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 6ec3fc969ad5..e7a244051c62 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FLOPPY_H #define _ASM_X86_FLOPPY_H +#include <linux/sizes.h> #include <linux/vmalloc.h> /* @@ -22,10 +23,7 @@ */ #define _CROSS_64KB(a, s, vdma) \ (!(vdma) && \ - ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) - -#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1) - + ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K)) #define SW fd_routine[use_virtual_dma & 1] #define CSW fd_routine[can_use_virtual_dma & 1] @@ -206,7 +204,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) { #ifdef FLOPPY_SANITY_CHECK - if (CROSS_64KB(addr, size)) { + if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) { printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); return -1; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50e51047e1fe..4a8d3d96bfe4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7109,9 +7109,10 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. 
*/ -static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +static void bfq_depth_updated(struct request_queue *q) { - unsigned int nr_requests = bfqd->queue->nr_requests; + struct bfq_data *bfqd = q->elevator->elevator_data; + unsigned int nr_requests = q->nr_requests; /* * In-word depths if no bfq_queue is being weight-raised: @@ -7143,21 +7144,8 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); -} - -static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -{ - bfq_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void bfq_exit_queue(struct elevator_queue *e) @@ -7369,6 +7357,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + bfq_depth_updated(q); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -7628,7 +7617,6 @@ static struct elevator_type iosched_bfq_mq = { .request_merged = bfq_request_merged, .has_work = bfq_has_work, .depth_updated = bfq_depth_updated, - .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, }, diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6b077ca937f6..bed26f1ec869 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -230,7 +230,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, } static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, - int nr_vecs, ssize_t bytes, ssize_t offset) + int nr_vecs, ssize_t bytes, ssize_t offset, + bool *is_p2p) { unsigned int nr_bvecs = 0; int i, j; @@ -251,6 +252,9 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, bytes -= next; } + if (is_pci_p2pdma_page(pages[i])) + *is_p2p = true; + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); offset = 0; nr_bvecs++; @@ -262,13 +266,13 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + iov_iter_extraction_t extraction_flags = 0; size_t offset, bytes = iter->count; + bool copy, is_p2p = false; unsigned int nr_bvecs; int ret, nr_vecs; - bool copy; if (bio_integrity(bio)) return -EINVAL; @@ -285,16 +289,25 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) pages = NULL; } - copy = !iov_iter_is_aligned(iter, align, align); - ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); + copy = iov_iter_alignment(iter) & + blk_lim_dma_alignment_and_pad(&q->limits); + + if (blk_queue_pci_p2pdma(q)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, + 
extraction_flags, &offset); if (unlikely(ret < 0)) goto free_bvec; - nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset, + &is_p2p); if (pages != stack_pages) kvfree(pages); if (nr_bvecs > queue_max_integrity_segments(q)) copy = true; + if (is_p2p) + bio->bi_opf |= REQ_NOMERGE; if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes); diff --git a/block/bio.c b/block/bio.c index 3b371a5da159..3a1a848940dd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST @@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->nr--; put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } @@ -614,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); @@ -981,7 +985,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; @@ -1227,13 +1231,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; - /* - * Each segment in the iov is required to be a block size multiple. - * However, we may not be able to get the entire segment if it spans - * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the - * result to ensure the bio's total size is correct. The remainder of - * the iov data will be picked up in the next bio iteration. - */ size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); @@ -1241,18 +1238,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - - if (bio->bi_bdev) { - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); - size -= trim; - } - - if (unlikely(!size)) { - ret = -EFAULT; - goto out; - } - for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); @@ -1297,10 +1282,44 @@ out: return ret; } +/* + * Aligns the bio size to the len_align_mask, releasing excess bio vecs that + * __bio_iov_iter_get_pages may have inserted, and reverting the trimmed length + * for the next iteration.
+ */ +static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) +{ + size_t nbytes = bio->bi_iter.bi_size & len_align_mask; + + if (!nbytes) + return 0; + + iov_iter_revert(iter, nbytes); + bio->bi_iter.bi_size -= nbytes; + do { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (nbytes < bv->bv_len) { + bv->bv_len -= nbytes; + break; + } + + bio_release_page(bio, bv->bv_page); + bio->bi_vcnt--; + nbytes -= bv->bv_len; + } while (nbytes); + + if (!bio->bi_vcnt) + return -EFAULT; + return 0; +} + /** - * bio_iov_iter_get_pages - add user or kernel pages to a bio + * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1317,7 +1336,8 @@ out: * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; @@ -1336,9 +1356,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - return bio->bi_vcnt ? 0 : ret; + if (bio->bi_vcnt) + return bio_iov_iter_align_down(bio, iter, len_align_mask); + return ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned);
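A minimal caller-side sketch of the renamed interface (illustrative only: dio_fill_bio() and its context are hypothetical; bio_iov_iter_get_pages_aligned() and bdev_logical_block_size() are the only kernel symbols assumed):

/*
 * Hypothetical direct-I/O style helper: add the iterator's pages to @bio
 * while keeping the total size a logical-block-size multiple, per the
 * kernel-doc above. A mask of 0 would accept any length.
 */
static int dio_fill_bio(struct bio *bio, struct iov_iter *iter)
{
	unsigned int lbs_mask = bdev_logical_block_size(bio->bi_bdev) - 1;

	return bio_iov_iter_get_pages_aligned(bio, iter, lbs_mask);
}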
static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fe9ebd6a2e14..f93de34fe87d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -110,12 +110,6 @@ static struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -static bool blkcg_policy_enabled(struct request_queue *q, - const struct blkcg_policy *pol) -{ - return pol && test_bit(pol->plid, q->blkcg_pols); -} - static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, @@ -883,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, disk = ctx->bdev->bd_disk; q = disk->queue; - /* - * blkcg_deactivate_policy() requires queue to be frozen, we can grab - * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). - */ - ret = blk_queue_enter(q, 0); - if (ret) - goto fail; - + /* Prevent concurrent execution with blkcg_deactivate_policy() */ + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -920,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); - new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } spin_lock_irq(&q->queue_lock); @@ -957,7 +945,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - blk_queue_exit(q); + mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; @@ -965,9 +953,8 @@ fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); -fail_exit_queue: - blk_queue_exit(q); -fail: +fail_exit: + mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 81868ad86330..1cce3294634d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -370,11 +370,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) @@ -459,6 +454,12 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } +static inline bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) +{ + return pol && test_bit(pol->plid, q->blkcg_pols); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ @@ -491,7 +492,6 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } -static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } diff --git a/block/blk-core.c b/block/blk-core.c index a27185cd8ede..14ae73eebe0d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -539,7 +539,7 @@ static inline void bio_check_ro(struct bio *bio) } } -static noinline int should_fail_bio(struct bio *bio) +int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; @@ -727,10 +727,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -void submit_bio_noacct_nocheck(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); @@ -747,12 +746,16 @@ void submit_bio_noacct_nocheck(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned.
*/ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) + if (current->bio_list) { + if (split) + bio_list_add_head(¤t->bio_list[0], bio); + else + bio_list_add(¤t->bio_list[0], bio); + } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); - else + } else { __submit_bio_noacct(bio); + } } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, @@ -873,7 +876,7 @@ void submit_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); return; not_supported: diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..86b27f96051a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, - bio_src->bi_opf); + bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; @@ -222,18 +221,14 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) if (++i == BIO_MAX_VECS) break; } - if (num_sectors < bio_sectors(bio)) { - struct bio *split_bio; - split_bio = bio_split(bio, num_sectors, GFP_NOIO, - &crypto_bio_split); - if (IS_ERR(split_bio)) { - bio->bi_status = BLK_STS_RESOURCE; + if (num_sectors < bio_sectors(bio)) { + bio = bio_submit_split_bioset(bio, num_sectors, + &crypto_bio_split); + if (!bio) return false; - } - bio_chain(split_bio, bio); - submit_bio_noacct(bio); - *bio_ptr = split_bio; + + *bio_ptr = bio; } return true; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ce08ad4565e2..9b27963680dc 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -120,64 +120,6 @@ out: NULL); } -/** - * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request to map - * @sglist: target scatterlist - * - * Description: Map the integrity vectors in request into a - * scatterlist. The scatterlist must be big enough to hold all - * elements. I.e. sized using blk_rq_count_integrity_sg() or - * rq->nr_integrity_segments. 
- */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) -{ - struct bio_vec iv, ivprv = { NULL }; - struct request_queue *q = rq->q; - struct scatterlist *sg = NULL; - struct bio *bio = rq->bio; - unsigned int segments = 0; - struct bvec_iter iter; - int prev = 0; - - bio_for_each_integrity_vec(iv, bio, iter) { - if (prev) { - if (!biovec_phys_mergeable(q, &ivprv, &iv)) - goto new_segment; - if (sg->length + iv.bv_len > queue_max_segment_size(q)) - goto new_segment; - - sg->length += iv.bv_len; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - sg_unmark_end(sg); - sg = sg_next(sg); - } - - sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); - segments++; - } - - prev = 1; - ivprv = iv; - } - - if (sg) - sg_mark_end(sg); - - /* - * Something must have been wrong if the figured number of segment - * is bigger than number of req's physical integrity segments - */ - BUG_ON(segments > rq->nr_integrity_segments); - BUG_ON(segments > queue_max_integrity_segments(q)); - return segments; -} -EXPORT_SYMBOL(blk_rq_map_integrity_sg); - int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..45bd18f68541 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it. */ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && @@ -750,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { + struct request_queue *q = blkiolat->rqos.disk->queue; unsigned int memflags; memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; + if (enabled) + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q); + else + blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q); blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-map.c b/block/blk-map.c index 23e5d5ebe59e..165f2234f00f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); + bio_init_inline(bio, NULL, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; @@ -253,10 +253,11 @@ static void blk_mq_map_bio_put(struct bio *bio) static struct bio *blk_rq_map_bio_alloc(struct request *rq, unsigned int nr_vecs, gfp_t gfp_mask) { + struct block_device *bdev = rq->q->disk ? 
rq->q->disk->part0 : NULL; struct bio *bio; if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { - bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, + bio = bio_alloc_bioset(bdev, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; @@ -264,7 +265,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + bio_init_inline(bio, bdev, nr_vecs, req_op(rq)); } return bio; } @@ -326,7 +327,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + bio_init_inline(bio, NULL, nr_vecs, op); if (is_vmalloc_addr(data)) { bio->bi_private = data; if (!bio_add_vmalloc(bio, data, len)) { @@ -392,7 +393,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); + bio_init_inline(bio, NULL, nr_pages, op); while (len) { struct page *page; @@ -443,7 +444,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) int ret; /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..37864c5d287e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,34 +104,58 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/* + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * On success, the newly allocated bio representing the initial part is + * returned; on failure, NULL is returned and the original bio is failed. + */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + + if (should_fail_bio(bio)) + bio_io_error(bio); + else if (!blk_throtl_bio(bio)) + submit_bio_noacct_nocheck(bio, true); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
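A caller-side sketch of the contract documented above (illustrative only: my_driver_submit() is hypothetical; bio_submit_split_bioset() and bio_sectors() are the only kernel symbols assumed):

/*
 * Hypothetical driver path that bounds a bio to max_sectors. The
 * remainder is chained and resubmitted by bio_submit_split_bioset();
 * the caller keeps working with the returned front half, or stops if
 * the original bio was failed (NULL return, bi_status already set).
 */
static void my_driver_submit(struct bio *bio, unsigned int max_sectors,
			     struct bio_set *bs)
{
	if (bio_sectors(bio) > max_sectors) {
		bio = bio_submit_split_bioset(bio, max_sectors, bs);
		if (!bio)
			return;
	}
	/* process the now-bounded bio */
}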
static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, @@ -279,25 +303,30 @@ static unsigned int bio_split_alignment(struct bio *bio, } /** - * bio_split_rw_at - check if and where to split a read/write bio + * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio + * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes) +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { + if (bv.bv_offset & lim->dma_alignment || + bv.bv_len & len_align_mask) + return -EINVAL; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -339,8 +368,16 @@ split: * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. + * + * It is possible to submit a bio that can't be split into a valid io: + * there may either be too many discontiguous vectors for the max + * segments limit, or the bio may contain virtual boundary gaps without + * having a valid block sized split. A zero byte result means one of + * those conditions occurred.
*/ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); + if (!bytes) + return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -350,7 +387,7 @@ split: bio_clear_polled(bio); return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw_at); +EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 32c65efdda46..4896525b1c05 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef..449950029872 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include <linux/blk-integrity.h> #include <linux/blk-mq-dma.h> #include "blk.h" @@ -10,29 +11,38 @@ struct phys_vec { u32 len; }; -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = bio_integrity(iter->bio)->bip_iter; + iter->bvecs = bio_integrity(iter->bio)->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; + } + return true; +} + +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -42,20 +52,16 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - } + if (!__blk_map_iter_next(iter)) + break; - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; @@ -125,6 +131,72 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static inline void blk_rq_map_iter_init(struct request *rq, + struct blk_map_iter *iter) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + *iter = (struct blk_map_iter) { + 
.bvecs = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } else if (bio) { + *iter = (struct blk_map_iter) { + .bio = bio, + .bvecs = bio->bi_io_vec, + .iter = bio->bi_iter, + }; + } else { + /* the internal flush request may not have bio attached */ + *iter = (struct blk_map_iter) {}; + } +} + +static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + unsigned int total_len) +{ + struct phys_vec vec; + + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); + iter->status = BLK_STS_OK; + + /* + * Grab the first segment ASAP because we'll need it to check for P2P + * transfers. + */ + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + if (iter->iter.is_integrity) + bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; + else + req->cmd_flags |= REQ_P2PDMA; + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; + } + + if (blk_can_dma_map_iova(req, dma_dev) && + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -150,43 +222,9 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { - unsigned int total_len = blk_rq_payload_bytes(req); - struct phys_vec vec; - - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; - memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); - iter->status = BLK_STS_OK; - - /* - * Grab the first segment ASAP because we'll need it to check for P2P - * transfers. - */ - if (!blk_map_iter_next(req, &iter->iter, &vec)) - return false; - - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. 
- */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } - } - - if (blk_can_dma_map_iova(req, dma_dev) && - dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) - return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); - return blk_dma_map_direct(req, dma_dev, iter, &vec); + blk_rq_map_iter_init(req, &iter->iter); + return blk_dma_map_iter_start(req, dma_dev, state, iter, + blk_rq_payload_bytes(req)); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); @@ -246,16 +284,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { - .bio = rq->bio, - }; + struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - + blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, @@ -275,3 +308,124 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +/** + * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment + * for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are + * provided by the caller and don't need to be initialized. @state needs to be + * stored for use at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr + * and the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +/** + * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for + * a request + * @req: request to map + * @dma_dev: device to map to + * @iter: block layer DMA iterator + * + * Iterate to the next integrity mapping after a previous call to + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description + * of the arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
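A driver-side sketch of the start/next iteration protocol these kernel-docs describe (illustrative only: map_rq_integrity() is hypothetical; the blk_rq_integrity_dma_map_iter_* helpers and the blk_dma_iter fields are taken from this patch):

/*
 * Hypothetical LLD mapping loop: each successful call yields one mapped
 * segment in iter.addr/iter.len; when iteration stops, iter.status is
 * BLK_STS_OK at the natural end or carries the error code.
 */
static blk_status_t map_rq_integrity(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state)
{
	struct blk_dma_iter iter;

	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;

	do {
		/* program iter.addr and iter.len into the device's SGL */
	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));

	return iter.status;
}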
+ +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request to map + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg() or + * rq->nr_integrity_segments. + */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct request_queue *q = rq->q; + struct scatterlist *sg = NULL; + struct bio *bio = rq->bio; + unsigned int segments = 0; + struct phys_vec vec; + + struct blk_map_iter iter = { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + + while (blk_map_iter_next(rq, &iter, &vec)) { + sg = blk_next_sg(&sg, sglist); + sg_set_page(sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + segments++; + } + + if (sg) + sg_mark_end(sg); + + /* + * Something must have gone wrong if the computed number of segments + * is bigger than the number of req's physical integrity segments + */ + BUG_ON(segments > rq->nr_integrity_segments); + BUG_ON(segments > queue_max_integrity_segments(q)); + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); +#endif diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e2ce4a28e6c9..d06bb137a743 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -454,7 +454,7 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table, } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues) + unsigned int nr_hw_queues, unsigned int nr_requests) { unsigned int nr_tags; int i; @@ -470,13 +470,8 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, nr_tags * sizeof(struct blk_mq_tags *), gfp); if (!et) return NULL; - /* - * Default to double of smaller one between hw queue_depth and - * 128, since we don't split into sync/async like the old code - * did. Additionally, this is a per-hw queue depth. - */ - et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, - BLKDEV_DEFAULT_RQ); + + et->nr_requests = nr_requests; et->nr_hw_queues = nr_hw_queues; if (blk_mq_is_shared_tags(set->flags)) { @@ -521,7 +516,8 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, * concurrently.
*/ if (q->elevator) { - et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!et) goto out_unwind; if (xa_insert(et_table, q->id, et, gfp)) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index b554e1d55950..8e21a6b1415d 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -24,7 +24,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues); + unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); void blk_mq_free_sched_tags(struct elevator_tags *et, @@ -92,4 +92,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } +static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, + unsigned int depth) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) + sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, + depth); +} + #endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 24656980f443..58ec293373c6 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -34,7 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); @@ -150,9 +149,11 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) return; hctx_for_each_ctx(hctx, ctx, i) - kobject_del(&ctx->kobj); + if (ctx->kobj.state_in_sysfs) + kobject_del(&ctx->kobj); - kobject_del(&hctx->kobj); + if (hctx->kobj.state_in_sysfs) + kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d880c50629d6..c7a4d4b9cc87 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -8,6 +8,9 @@ */ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/kmemleak.h> #include <linux/delay.h> #include "blk.h" @@ -253,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; - unsigned long flags; - spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; - spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -297,15 +297,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. - * @q: Request queue to examine. + * @q: Request queue @hctx is associated with (@hctx->queue). * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. - * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) - * where rq is a pointer to a request. Return true to continue - * iterating tags, false to stop. - * @data: Will be passed as third argument to @fn. 
+ * @fn will be called as follows: @fn(rq, @data) where rq is a + * pointer to a request. Return %true to continue iterating tags; + * %false to stop. + * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ @@ -371,9 +371,9 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @data, - * @reserved) where rq is a pointer to a request. Return true - * to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @data) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ @@ -406,10 +406,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. @@ -424,10 +423,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where + * rq is a pointer to a request. Return true to continue iterating + * tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after @@ -437,7 +435,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; - int i, nr_tags; + int i, nr_tags, srcu_idx; + + srcu_idx = srcu_read_lock(&tagset->tags_srcu); nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; @@ -446,6 +446,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } + srcu_read_unlock(&tagset->tags_srcu, srcu_idx); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); @@ -483,11 +484,10 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request - * on @q. 
@fn will be called as follows: @fn(hctx, rq, @priv, - * reserved) where rq is a pointer to a request and hctx points - * to the hardware queue associated with the request. 'reserved' - * indicates whether or not @rq is a reserved request. - * @priv: Will be passed as third argument to @fn. + * on @q. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request and hctx points to the hardware queue + * associated with the request. + * @priv: Will be passed as second argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only @@ -496,6 +496,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { + int srcu_idx; + /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid @@ -504,6 +506,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; + srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu); if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; @@ -533,6 +536,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(hctx, q, btags, fn, priv, false); } } + srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx); blk_queue_exit(q); } @@ -562,6 +566,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + INIT_LIST_HEAD(&tags->page_list); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto out_free_tags; if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) @@ -576,63 +582,37 @@ out_free_tags: return NULL; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +static void blk_mq_free_tags_callback(struct rcu_head *head) { - sbitmap_queue_free(&tags->bitmap_tags); - sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); -} - -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tagsptr, unsigned int tdepth, - bool can_grow) -{ - struct blk_mq_tags *tags = *tagsptr; - - if (tdepth <= tags->nr_reserved_tags) - return -EINVAL; - - /* - * If we are allowed to grow beyond the original size, allocate - * a new set of tags before freeing the old one. - */ - if (tdepth > tags->nr_tags) { - struct blk_mq_tag_set *set = hctx->queue->tag_set; - struct blk_mq_tags *new; - - if (!can_grow) - return -EINVAL; - - /* - * We need some sort of upper limit, set it high enough that - * no valid use cases should require more. - */ - if (tdepth > MAX_SCHED_RQ) - return -EINVAL; + struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, + rcu_head); + struct page *page; + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); /* - * Only the sbitmap needs resizing since we allocated the max - * initially. + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). 
*/ - if (blk_mq_is_shared_tags(set->flags)) - return 0; + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + kfree(tags); +} - new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); - if (!new) - return -ENOMEM; +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) +{ + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); - blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); - *tagsptr = new; - } else { - /* - * Don't need (or can't) update reserved tags here, they - * remain static and should never need resizing. - */ - sbitmap_queue_resize(&tags->bitmap_tags, - tdepth - tags->nr_reserved_tags); + /* if tags pages is not allocated yet, free tags directly */ + if (list_empty(&tags->page_list)) { + kfree(tags); + return; } - return 0; + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) diff --git a/block/blk-mq.c b/block/blk-mq.c index ba3a4b77f578..09f579414161 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,6 +396,15 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { @@ -3168,6 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; @@ -3415,7 +3425,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; - unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized @@ -3439,22 +3448,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, } } } - - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&drv_tags->lock, flags); - spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3478,27 +3477,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs(). - */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). 
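+	 * The pages may still be dereferenced by readers walking
+	 * tags->rqs[] under set->tags_srcu, so they must not be released
+	 * before a grace period has elapsed.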
+ */ } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, @@ -3560,7 +3552,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, err_free_rqs: kfree(tags->rqs); err_free_tags: - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); return NULL; } @@ -3590,8 +3582,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - INIT_LIST_HEAD(&tags->page_list); - /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size @@ -3678,8 +3668,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) struct rq_iter_data data = { .hctx = hctx, }; + int srcu_idx; + srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); + return data.has_rq; } @@ -3899,7 +3893,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; - unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) @@ -3909,15 +3902,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); +} - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&tags->lock, flags); - spin_unlock_irqrestore(&tags->lock, flags); +static void blk_free_flush_queue_callback(struct rcu_head *head) +{ + struct blk_flush_queue *fq = + container_of(head, struct blk_flush_queue, rcu_head); + + blk_free_flush_queue(fq); } /* hctx->ctxs will be freed in queue's release handler */ @@ -3939,6 +3931,10 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, + blk_free_flush_queue_callback); + hctx->fq = NULL; + xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); @@ -3964,13 +3960,19 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; + + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); + if (!hctx->fq) + goto fail; + hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto fail; + goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) @@ -3987,6 +3989,9 @@ static int blk_mq_init_hctx(struct request_queue *q, exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + fail_free_fq: + blk_free_flush_queue(hctx->fq); + hctx->fq = NULL; fail: return -1; } @@ -4038,16 +4043,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); - if (!hctx->fq) - goto free_bitmap; - blk_mq_hctx_kobj_init(hctx); return hctx; 
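Together with the kernel-doc updates above, all the iterators now take the two-argument busy_tag_iter_fn form. A minimal driver-side callback for illustration (my_count_inflight and the dev->tag_set usage are hypothetical, not part of this patch):

	static bool my_count_inflight(struct request *rq, void *data)
	{
		unsigned int *inflight = data;

		(*inflight)++;
		return true;	/* return false to stop iterating */
	}

	/* e.g. from a driver's timeout or debugfs path: */
	unsigned int inflight = 0;

	blk_mq_tagset_busy_iter(&dev->tag_set, my_count_inflight, &inflight);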
- free_bitmap: - sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: @@ -4101,7 +4100,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); return NULL; } @@ -4129,7 +4128,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); } } @@ -4828,6 +4827,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); @@ -4836,7 +4838,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4865,6 +4867,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4910,6 +4914,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); @@ -4917,57 +4924,59 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *et, + unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; - int ret; unsigned long i; - if (WARN_ON_ONCE(!q->mq_freeze_depth)) - return -EINVAL; - - if (!set) - return -EINVAL; - - if (q->nr_requests == nr) - return 0; - blk_mq_quiesce_queue(q); - ret = 0; - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->tags) - continue; + if (blk_mq_is_shared_tags(set->flags)) { /* - * If we're using an MQ scheduler, just update the scheduler - * queue depth. This is similar to what the old code would do. + * Shared tags, for sched tags, we allocate max initially hence + * tags can't grow, see blk_mq_alloc_sched_tags(). */ - if (hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - } else { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } else if (!q->elevator) { + /* + * Non-shared hardware tags, nr is already checked from + * queue_requests_store() and tags can't grow. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; + sbitmap_queue_resize(&hctx->tags->bitmap_tags, + nr - hctx->tags->nr_reserved_tags); } - if (ret) - break; - if (q->elevator && q->elevator->type->ops.depth_updated) - q->elevator->type->ops.depth_updated(hctx); - } - if (!ret) { - q->nr_requests = nr; - if (blk_mq_is_shared_tags(set->flags)) { - if (q->elevator) - blk_mq_tag_update_sched_shared_tags(q); - else - blk_mq_tag_resize_shared_tags(set, nr); + } else if (nr <= q->elevator->et->nr_requests) { + /* Non-shared sched tags, and tags don't grow. 
*/ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, + nr - hctx->sched_tags->nr_reserved_tags); } + } else { + /* Non-shared sched tags, and tags grow */ + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = et->tags[i]; + old_et = q->elevator->et; + q->elevator->et = et; } - blk_mq_unquiesce_queue(q); + q->nr_requests = nr; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); - return ret; + blk_mq_unquiesce_queue(q); + return old_et; } /* diff --git a/block/blk-mq.h b/block/blk-mq.h index affb2e14b56e..af42dc018808 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -6,6 +6,7 @@ #include "blk-stat.h" struct blk_mq_tag_set; +struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; @@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *tags, + unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); @@ -59,7 +62,7 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags); +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, @@ -110,6 +113,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, } /* + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. 
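+ * E.g. queue_depth 32 yields 2 * min(32, 128) = 64 requests per hw
+ * queue, while queue_depth 1024 is capped at 2 * min(1024, 128) = 256
+ * (128 being BLKDEV_DEFAULT_RQ).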
+ */ +static inline unsigned int blk_mq_default_nr_requests( + struct blk_mq_tag_set *set) +{ + return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); +} + +/* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); @@ -162,7 +176,7 @@ struct blk_mq_alloc_data { struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); -void blk_mq_free_tags(struct blk_mq_tags *tags); +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, @@ -170,8 +184,6 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); diff --git a/block/blk-settings.c b/block/blk-settings.c index d6438e6c276d..54cffaae4df4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -56,6 +56,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; + lim->atomic_write_hw_max = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -223,6 +224,27 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) lim->atomic_write_hw_boundary >> SECTOR_SHIFT; } +/* + * Test whether any boundary is aligned with any chunk size. Stacked + * devices store any stripe size in t->chunk_sectors. + */ +static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors, + unsigned int boundary_sectors) +{ + if (!chunk_sectors || !boundary_sectors) + return true; + + if (boundary_sectors > chunk_sectors && + boundary_sectors % chunk_sectors) + return false; + + if (chunk_sectors > boundary_sectors && + chunk_sectors % boundary_sectors) + return false; + + return true; +} + static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; @@ -232,6 +254,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; + /* UINT_MAX indicates stacked limits in initial state */ + if (lim->atomic_write_hw_max == UINT_MAX) + goto unsupported; + if (!lim->atomic_write_hw_max) goto unsupported; @@ -259,20 +285,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (WARN_ON_ONCE(lim->atomic_write_hw_max > lim->atomic_write_hw_boundary)) goto unsupported; - /* - * A feature of boundary support is that it disallows bios to - * be merged which would result in a merged request which - * crosses either a chunk sector or atomic write HW boundary, - * even though chunk sectors may be just set for performance. - * For simplicity, disallow atomic writes for a chunk sector - * which is non-zero and smaller than atomic write HW boundary. - * Furthermore, chunk sectors must be a multiple of atomic - * write HW boundary. Otherwise boundary support becomes - * complicated. - * Devices which do not conform to these rules can be dealt - * with if and when they show up. 
- */ - if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) + + if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary( + lim->chunk_sectors, boundary_sectors))) goto unsupported; /* @@ -639,25 +654,6 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, return true; } -/* Check for valid boundary of first bottom device */ -static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, - struct queue_limits *b) -{ - /* - * Ensure atomic write boundary is aligned with chunk sectors. Stacked - * devices store chunk sectors in t->io_min. - */ - if (b->atomic_write_hw_boundary > t->io_min && - b->atomic_write_hw_boundary % t->io_min) - return false; - if (t->io_min > b->atomic_write_hw_boundary && - t->io_min % b->atomic_write_hw_boundary) - return false; - - t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; - return true; -} - static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { unsigned int chunk_bytes; @@ -695,13 +691,14 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) + if (!blk_valid_atomic_writes_boundary(t->chunk_sectors, + b->atomic_write_hw_boundary >> SECTOR_SHIFT)) return false; t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; t->atomic_write_hw_max = b->atomic_write_hw_max; + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } @@ -717,18 +714,14 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; - /* - * If atomic_write_hw_max is set, we have already stacked 1x bottom - * device, so check for compliance. - */ - if (t->atomic_write_hw_max) { + /* UINT_MAX indicates no stacking of bottom devices yet */ + if (t->atomic_write_hw_max == UINT_MAX) { + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + } else { if (!blk_stack_atomic_writes_tail(t, b)) goto unsupported; - return; } - - if (!blk_stack_atomic_writes_head(t, b)) - goto unsupported; blk_stack_atomic_writes_chunk_sectors(t); return; @@ -763,7 +756,8 @@ unsupported: int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { - unsigned int top, bottom, alignment, ret = 0; + unsigned int top, bottom, alignment; + int ret = 0; t->features |= (b->features & BLK_FEAT_INHERIT_MASK); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4a7f1a349998..76c47fe9b8d6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -64,28 +64,66 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { - unsigned long nr; - int ret, err; - unsigned int memflags; struct request_queue *q = disk->queue; - - if (!queue_is_mq(q)) - return -EINVAL; + struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *et = NULL; + unsigned int memflags; + unsigned long nr; + int ret; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; - memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); + /* + * Serialize updating nr_requests with concurrent queue_requests_store() + * and switching elevator. 
+ */ + down_write(&set->update_nr_hwq_lock); + + if (nr == q->nr_requests) + goto unlock; + if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - err = blk_mq_update_nr_requests(disk->queue, nr); - if (err) - ret = err; + /* + * Switching elevator is protected by update_nr_hwq_lock: + * - read lock is held from elevator sysfs attribute; + * - write lock is held from updating nr_hw_queues; + * Hence it's safe to access q->elevator here with write lock held. + */ + if (nr <= set->reserved_tags || + (q->elevator && nr > MAX_SCHED_RQ) || + (!q->elevator && nr > set->queue_depth)) { + ret = -EINVAL; + goto unlock; + } + + if (!blk_mq_is_shared_tags(set->flags) && q->elevator && + nr > q->elevator->et->nr_requests) { + /* + * Tags will grow, allocate memory before freezing queue to + * prevent deadlock. + */ + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); + if (!et) { + ret = -ENOMEM; + goto unlock; + } + } + + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + et = blk_mq_update_nr_requests(q, et, nr); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + + if (et) + blk_mq_free_sched_tags(et, set); + +unlock: + up_write(&set->update_nr_hwq_lock); return ret; } @@ -620,6 +658,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. + */ memflags = blk_mq_freeze_queue(q); rqos = wbt_rq_qos(q); @@ -638,11 +681,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (wbt_get_min_lat(q) == val) goto out; - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. - */ blk_mq_quiesce_queue(q); mutex_lock(&disk->rqos_state_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 397b6a410f9e..2c5b64b1a724 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); blk_finish_plug(&plug); } } @@ -1327,17 +1327,13 @@ static int blk_throtl_init(struct gendisk *disk) INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); - /* - * Freeze queue before activating policy, to synchronize with IO path, - * which is protected by 'q_usage_counter'. - */ memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; - /* activate policy */ + /* activate policy, blk_throtl_activated() will return true */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; @@ -1846,12 +1842,15 @@ void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - if (!blk_throtl_activated(q)) + /* + * blkg_destroy_all() already deactivate throtl policy, just check and + * free throtl data. 
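+ * blk_throtl_activated() would return false at this point because the
+ * policy is no longer enabled, so test q->td directly to avoid leaking
+ * it.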
+ */ + if (!q->td) return; timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 3b27755bfbff..9d7a42c039a1 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -156,7 +156,13 @@ void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { - return q->td != NULL; + /* + * q->td guarantees that the blk-throttle module is already loaded, + * and the plid of blk-throttle is assigned. + * blkcg_policy_enabled() guarantees that the policy is activated + * in the request_queue. + */ + return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl); } static inline bool blk_should_throtl(struct bio *bio) @@ -164,11 +170,6 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg; int rw = bio_data_dir(bio); - /* - * This is called under bio_queue_enter(), and it's synchronized with - * the activation of blk-throtl, which is protected by - * blk_mq_freeze_queue(). - */ if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; @@ -194,7 +195,10 @@ static inline bool blk_should_throtl(struct bio *bio) static inline bool blk_throtl_bio(struct bio *bio) { - + /* + * block throttling takes effect if the policy is activated + * in the bio's request_queue. + */ if (!blk_should_throtl(bio)) return false; diff --git a/block/blk.h b/block/blk.h index 46f566f9b126..170794632135 100644 --- a/block/blk.h +++ b/block/blk.h @@ -41,6 +41,7 @@ struct blk_flush_queue { struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; + struct rcu_head rcu_head; }; bool is_flush_rq(struct request *req); @@ -54,7 +55,7 @@ bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -void submit_bio_noacct_nocheck(struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio, bool split); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) @@ -615,6 +616,7 @@ extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); +int should_fail_bio(struct bio *bio); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ @@ -680,48 +682,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - 
return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); diff --git a/block/elevator.c b/block/elevator.c index fe96c6f4753c..e2ebfbf107b3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -669,7 +669,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!ctx->et) return -ENOMEM; } diff --git a/block/elevator.h b/block/elevator.h index adc5c157e17e..c4d20155065e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -37,7 +37,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*depth_updated)(struct blk_mq_hw_ctx *); + void (*depth_updated)(struct request_queue *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); diff --git a/block/fops.c b/block/fops.c index ddbc69c0922b..c2c0396ea9ee 100644 --- a/block/fops.c +++ b/block/fops.c @@ -39,8 +39,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || - !bdev_iter_is_aligned(bdev, iter); + return (iocb->ki_pos | iov_iter_count(iter)) & + (bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 @@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_pages(&bio, iter); + ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/block/ioctl.c b/block/ioctl.c index c9ea8e53871e..d7489a56b33c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev, */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) @@ -515,7 +515,7 @@ 
static int compat_hdio_getgeo(struct block_device *bdev, * want to override it. */ geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 70cbc7b2deb4..18efd6ef2a2b 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -399,6 +399,14 @@ err: return ERR_PTR(ret); } +static void kyber_depth_updated(struct request_queue *q) +{ + struct kyber_queue_data *kqd = q->elevator->elevator_data; + + kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; + blk_mq_set_min_shallow_depth(q, kqd->async_depth); +} + static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; @@ -413,6 +421,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) eq->elevator_data = kqd; q->elevator = eq; + kyber_depth_updated(q); return 0; } @@ -440,15 +449,6 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); -} - static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct kyber_hctx_data *khd; @@ -493,7 +493,6 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - kyber_depth_updated(hctx); return 0; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b9b7cdf1d3c9..3e741d33142d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -136,10 +136,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; - if (!node) - return NULL; - - rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -507,22 +503,12 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) } /* Called by blk_mq_update_nr_requests(). */ -static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +static void dd_depth_updated(struct request_queue *q) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; - - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ -static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - dd_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void dd_exit_sched(struct elevator_queue *e) @@ -587,6 +573,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + dd_depth_updated(q); return 0; } @@ -1048,7 +1035,6 @@ static struct elevator_type mq_deadline = { .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, - .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 82d9c4c3fb41..631291fbb356 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state) goto out_nolab; /* set start if not filled by getgeo function e.g. virtblk */ geo->start = get_start_sect(bdev); - if (disk->fops->getgeo(bdev, geo)) + if (disk->fops->getgeo(disk, geo)) goto out_freeall; if (!fn || fn(disk, info)) { kfree(info); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2ded5e476d6e..b43a3196e2be 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); /** * ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd. * @sdev: SCSI device for which BIOS geometry is to be determined - * @bdev: block device associated with @sdev + * @unused: gendisk associated with @sdev * @capacity: capacity of SCSI device * @geom: location to which geometry will be output * @@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); * RETURNS: * Zero. */ -int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, +int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { geom[0] = 255; diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index df38fb364904..77d694448990 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -17,6 +17,7 @@ menuconfig BLK_DEV if BLK_DEV source "drivers/block/null_blk/Kconfig" +source "drivers/block/rnull/Kconfig" config BLK_DEV_FD tristate "Normal floppy disk support" @@ -311,15 +312,6 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. -config BLK_DEV_RUST_NULL - tristate "Rust null block driver (Experimental)" - depends on RUST - help - This is the Rust implementation of the null block driver. For now it - is only a minimal stub. - - If unsure, say N. 
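The ->getgeo() conversions in this series (block/ioctl.c, the partition code, and the drivers that follow) all apply one mechanical pattern: take the struct gendisk directly and fetch driver state via disk->private_data instead of reaching through the block_device. A hedged sketch with an illustrative mydrv driver and mydrv_device fields:

	/* before: static int mydrv_getgeo(struct block_device *bdev, ...) */
	static int mydrv_getgeo(struct gendisk *disk, struct hd_geometry *geo)
	{
		struct mydrv_device *dev = disk->private_data;

		/* report a synthetic BIOS geometry from driver state */
		geo->heads = dev->heads;
		geo->sectors = dev->sectors;
		geo->cylinders = dev->cylinders;
		return 0;
	}

Callers such as blkdev_getgeo() keep pre-seeding geo->start from get_start_sect(bdev), so drivers that never fill the start sector are unaffected by the prototype change.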
- config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a695ce74ef22..2d8096eb8cdf 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,9 +9,6 @@ # needed for trace events ccflags-y += -I$(src) -obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o -rnull_mod-y := rnull.o - obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o @@ -38,6 +35,7 @@ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/ obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 6357d86eafdc..2932b6653b6f 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = MINOR(bdev->bd_dev) & 3; + struct amiga_floppy_struct *p = disk->private_data; - geo->heads = unit[drive].type->heads; - geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; - geo->cylinders = unit[drive].type->tracks; + geo->heads = p->type->heads; + geo->sectors = p->dtype->sects * p->type->sect_mult; + geo->cylinders = p->type->tracks; return 0; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 00b74a845328..34ead75e7e02 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx, } static int -aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct aoedev *d = bdev->bd_disk->private_data; + struct aoedev *d = disk->private_data; if ((d->flags & DEVFL_UP) == 0) { printk(KERN_ERR "aoe: disk not up\n"); diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index cdf6e4041bb9..3b21750038ee 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -44,7 +44,7 @@ aoe_init(void) { int ret; - aoe_wq = alloc_workqueue("aoe_wq", 0, 0); + aoe_wq = alloc_workqueue("aoe_wq", WQ_PERCPU, 0); if (!aoe_wq) return -ENOMEM; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 0c2eabe14af3..9778259b30d4 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -44,45 +44,74 @@ struct brd_device { }; /* - * Look up and return a brd's page for a given sector. + * Look up and return a brd's page with reference grabbed for a given sector. */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); + struct page *page; + XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); + + rcu_read_lock(); +repeat: + page = xas_load(&xas); + if (xas_retry(&xas, page)) { + xas_reset(&xas); + goto repeat; + } + + if (!page) + goto out; + + if (!get_page_unless_zero(page)) { + xas_reset(&xas); + goto repeat; + } + + if (unlikely(page != xas_reload(&xas))) { + put_page(page); + xas_reset(&xas); + goto repeat; + } +out: + rcu_read_unlock(); + + return page; } /* * Insert a new page for a given sector, if one does not already exist. + * The returned page will grab reference. 
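+ * IOW the caller always receives the page with an extra reference
+ * held and must drop it with put_page() when done.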
*/ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector, blk_opf_t opf) - __releases(rcu) - __acquires(rcu) { gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO; struct page *page, *ret; - rcu_read_unlock(); page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); - if (!page) { - rcu_read_lock(); + if (!page) return ERR_PTR(-ENOMEM); - } xa_lock(&brd->brd_pages); ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL, page, gfp); - rcu_read_lock(); - if (ret) { + if (!ret) { + brd->brd_nr_pages++; + get_page(page); + xa_unlock(&brd->brd_pages); + return page; + } + + if (!xa_is_err(ret)) { + get_page(ret); xa_unlock(&brd->brd_pages); - __free_page(page); - if (xa_is_err(ret)) - return ERR_PTR(xa_err(ret)); + put_page(page); return ret; } - brd->brd_nr_pages++; + xa_unlock(&brd->brd_pages); - return page; + put_page(page); + return ERR_PTR(xa_err(ret)); } /* @@ -95,7 +124,7 @@ static void brd_free_pages(struct brd_device *brd) pgoff_t idx; xa_for_each(&brd->brd_pages, idx, page) { - __free_page(page); + put_page(page); cond_resched(); } @@ -117,7 +146,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); - rcu_read_lock(); page = brd_lookup_page(brd, sector); if (!page && op_is_write(opf)) { page = brd_insert_page(brd, sector, opf); @@ -135,13 +163,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) memset(kaddr, 0, bv.bv_len); } kunmap_local(kaddr); - rcu_read_unlock(); bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len); + if (page) + put_page(page); return true; out_error: - rcu_read_unlock(); if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else @@ -149,13 +177,6 @@ out_error: return false; } -static void brd_free_one_page(struct rcu_head *head) -{ - struct page *page = container_of(head, struct page, rcu_head); - - __free_page(page); -} - static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) { sector_t aligned_sector = round_up(sector, PAGE_SECTORS); @@ -170,7 +191,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); if (page) { - call_rcu(&page->rcu_head, brd_free_one_page); + put_page(page); brd->brd_nr_pages--; } aligned_sector += PAGE_SECTORS; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 24be0c2c4075..5336c3c5ca36 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -163,35 +163,35 @@ /* do print messages for unexpected interrupts */ static int print_unex = 1; -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/workqueue.h> -#include <linux/fdreg.h> -#include <linux/fd.h> -#include <linux/hdreg.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/mm.h> +#include <linux/async.h> #include <linux/bio.h> -#include <linux/string.h> -#include <linux/jiffies.h> -#include <linux/fcntl.h> +#include <linux/compat.h> #include <linux/delay.h> -#include <linux/mc146818rtc.h> /* CMOS defines */ -#include <linux/ioport.h> -#include <linux/interrupt.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/fd.h> +#include <linux/fdreg.h> +#include <linux/fs.h> +#include <linux/hdreg.h> #include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include 
<linux/ioport.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> #include <linux/major.h> -#include <linux/platform_device.h> +#include <linux/mc146818rtc.h> /* CMOS defines */ +#include <linux/mm.h> #include <linux/mod_devicetable.h> +#include <linux/module.h> #include <linux/mutex.h> -#include <linux/io.h> +#include <linux/platform_device.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/timer.h> #include <linux/uaccess.h> -#include <linux/async.h> -#include <linux/compat.h> +#include <linux/workqueue.h> /* * PS/2 floppies have much slower step rates than regular floppies. @@ -233,8 +233,6 @@ static unsigned short virtual_dma_port = 0x3f0; irqreturn_t floppy_interrupt(int irq, void *dev_id); static int set_dor(int fdc, char mask, char data); -#define K_64 0x10000 /* 64KB */ - /* the following is the mask of allowed drives. By default units 2 and * 3 of both floppy controllers are disabled, because switching on the * motor of these drives causes system hangs on some PCI computers. drive @@ -3092,16 +3090,13 @@ static int raw_cmd_copyin(int cmd, void __user *param, *rcmd = NULL; loop: - ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL); - if (!ptr) - return -ENOMEM; + ptr = memdup_user(param, sizeof(*ptr)); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); *rcmd = ptr; - ret = copy_from_user(ptr, param, sizeof(*ptr)); ptr->next = NULL; ptr->buffer_length = 0; ptr->kernel_data = NULL; - if (ret) - return -EFAULT; param += sizeof(struct floppy_raw_cmd); if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE) return -EINVAL; @@ -3363,9 +3358,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) return 0; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = (long)bdev->bd_disk->private_data; + int drive = (long)disk->private_data; int type = ITYPE(drive_state[drive].fd_device); struct floppy_struct *g; int ret; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8fc7761397bd..567192e371a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev, * that each partition is also 4KB aligned. Non-aligned partitions adversely * affects performance. * - * @dev Pointer to the block_device strucutre. + * @disk Pointer to the gendisk structure. * @geo Pointer to a hd_geometry structure. * * return value * 0 Operation completed successfully. * -ENOTTY An error occurred while reading the drive capacity.
*/ -static int mtip_block_getgeo(struct block_device *dev, +static int mtip_block_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct driver_data *dd = dev->bd_disk->private_data; + struct driver_data *dd = disk->private_data; sector_t capacity; if (!dd) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6463d0e8d0ce..1188f32a5e5e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -311,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (args) { INIT_WORK(&args->work, nbd_dead_link_work); args->index = nbd->index; - queue_work(system_wq, &args->work); + queue_work(system_percpu_wq, &args->work); } } if (!nsock->dead) { @@ -1217,6 +1217,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, if (!sock) return NULL; + if (!sk_is_tcp(sock->sk) && + !sk_is_stream_unix(sock->sk)) { + dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n"); + *err = -EINVAL; + sockfd_put(sock); + return NULL; + } + if (sock->ops->shutdown == sock_no_shutdown) { dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); *err = -EINVAL; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 91642c9a3b29..f982027e8c85 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -223,7 +223,7 @@ MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed nu static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); -MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index faafd7ff43d6..af0e21149dbc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7389,7 +7389,7 @@ static int __init rbd_init(void) * The number of active work items is limited by the number of * rbd devices * queue depth, so leave @max_active at default. 
*/ - rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); + rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rbd_wq) { rc = -ENOMEM; goto err_out_slab; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 15627417f12e..f1409e54010a 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen) rnbd_clt_put_dev(dev); } -static int rnbd_client_getgeo(struct block_device *block_device, +static int rnbd_client_getgeo(struct gendisk *disk, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct rnbd_clt_dev *dev = disk->private_data; struct queue_limits *limit = &dev->queue->limits; size = dev->size * (limit->logical_block_size / SECTOR_SIZE); @@ -1809,7 +1809,7 @@ static int __init rnbd_client_init(void) unregister_blkdev(rnbd_client_major, "rnbd"); return err; } - rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0); + rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0); if (!rnbd_clt_wq) { pr_err("Failed to load module, alloc_workqueue failed.\n"); rnbd_clt_destroy_sysfs_files(); diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs deleted file mode 100644 index 6366da12c5a5..000000000000 --- a/drivers/block/rnull.rs +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -//! This is a Rust implementation of the C null block driver. -//! -//! Supported features: -//! -//! - blk-mq interface -//! - direct completion -//! - block size 4k -//! -//! The driver is not configurable. - -use kernel::{ - alloc::flags, - block::mq::{ - self, - gen_disk::{self, GenDisk}, - Operations, TagSet, - }, - error::Result, - new_mutex, pr_info, - prelude::*, - sync::{Arc, Mutex}, - types::ARef, -}; - -module! { - type: NullBlkModule, - name: "rnull_mod", - authors: ["Andreas Hindborg"], - description: "Rust implementation of the C null block driver", - license: "GPL v2", -} - -#[pin_data] -struct NullBlkModule { - #[pin] - _disk: Mutex<GenDisk<NullBlkDevice>>, -} - -impl kernel::InPlaceModule for NullBlkModule { - fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> { - pr_info!("Rust null_blk loaded\n"); - - // Use a immediately-called closure as a stable `try` block - let disk = /* try */ (|| { - let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; - - gen_disk::GenDiskBuilder::new() - .capacity_sectors(4096 << 11) - .logical_block_size(4096)? - .physical_block_size(4096)? - .rotational(false) - .build(fmt!("rnullb{}", 0), tagset) - })(); - - try_pin_init!(Self { - _disk <- new_mutex!(disk?, "nullb:disk"), - }) - } -} - -struct NullBlkDevice; - -#[vtable] -impl Operations for NullBlkDevice { - #[inline(always)] - fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result { - mq::Request::end_ok(rq) - .map_err(|_e| kernel::error::code::EIO) - // We take no refcounts on the request, so we expect to be able to - // end the request. The request reference must be unique at this - // point, and so `end_ok` cannot fail. 
- .expect("Fatal error - expected to be able to end request"); - - Ok(()) - } - - fn commit_rqs() {} -} diff --git a/drivers/block/rnull/Kconfig b/drivers/block/rnull/Kconfig new file mode 100644 index 000000000000..7bc5b376c128 --- /dev/null +++ b/drivers/block/rnull/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Rust null block device driver configuration + +config BLK_DEV_RUST_NULL + tristate "Rust null block driver (Experimental)" + depends on RUST && CONFIGFS_FS + help + This is the Rust implementation of the null block driver. Like + the C version, the driver allows the user to create virutal block + devices that can be configured via various configuration options. + + If unsure, say N. diff --git a/drivers/block/rnull/Makefile b/drivers/block/rnull/Makefile new file mode 100644 index 000000000000..11cfa5e615dc --- /dev/null +++ b/drivers/block/rnull/Makefile @@ -0,0 +1,3 @@ + +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o +rnull_mod-y := rnull.o diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs new file mode 100644 index 000000000000..8498e9bae6fd --- /dev/null +++ b/drivers/block/rnull/configfs.rs @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +use super::{NullBlkDevice, THIS_MODULE}; +use core::fmt::{Display, Write}; +use kernel::{ + block::mq::gen_disk::{GenDisk, GenDiskBuilder}, + c_str, + configfs::{self, AttributeOperations}, + configfs_attrs, new_mutex, + page::PAGE_SIZE, + prelude::*, + str::{kstrtobool_bytes, CString}, + sync::Mutex, +}; +use pin_init::PinInit; + +pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> { + let item_type = configfs_attrs! { + container: configfs::Subsystem<Config>, + data: Config, + child: DeviceConfig, + attributes: [ + features: 0, + ], + }; + + kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {})) +} + +#[pin_data] +pub(crate) struct Config {} + +#[vtable] +impl AttributeOperations<0> for Config { + type Data = Config; + + fn show(_this: &Config, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + writer.write_str("blocksize,size,rotational,irqmode\n")?; + Ok(writer.bytes_written()) + } +} + +#[vtable] +impl configfs::GroupOperations for Config { + type Child = DeviceConfig; + + fn make_group( + &self, + name: &CStr, + ) -> Result<impl PinInit<configfs::Group<DeviceConfig>, Error>> { + let item_type = configfs_attrs! 
{ + container: configfs::Group<DeviceConfig>, + data: DeviceConfig, + attributes: [ + // Named for compatibility with C null_blk + power: 0, + blocksize: 1, + rotational: 2, + size: 3, + irqmode: 4, + ], + }; + + Ok(configfs::Group::new( + name.try_into()?, + item_type, + // TODO: cannot coerce new_mutex!() to impl PinInit<_, Error>, so put mutex inside + try_pin_init!( DeviceConfig { + data <- new_mutex!(DeviceConfigInner { + powered: false, + block_size: 4096, + rotational: false, + disk: None, + capacity_mib: 4096, + irq_mode: IRQMode::None, + name: name.try_into()?, + }), + }), + )) + } +} + +#[derive(Debug, Clone, Copy)] +pub(crate) enum IRQMode { + None, + Soft, +} + +impl TryFrom<u8> for IRQMode { + type Error = kernel::error::Error; + + fn try_from(value: u8) -> Result<Self> { + match value { + 0 => Ok(Self::None), + 1 => Ok(Self::Soft), + _ => Err(EINVAL), + } + } +} + +impl Display for IRQMode { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::None => f.write_str("0")?, + Self::Soft => f.write_str("1")?, + } + Ok(()) + } +} + +#[pin_data] +pub(crate) struct DeviceConfig { + #[pin] + data: Mutex<DeviceConfigInner>, +} + +#[pin_data] +struct DeviceConfigInner { + powered: bool, + name: CString, + block_size: u32, + rotational: bool, + capacity_mib: u64, + irq_mode: IRQMode, + disk: Option<GenDisk<NullBlkDevice>>, +} + +#[vtable] +impl configfs::AttributeOperations<0> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + + if this.data.lock().powered { + writer.write_str("1\n")?; + } else { + writer.write_str("0\n")?; + } + + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + let power_op = kstrtobool_bytes(page)?; + let mut guard = this.data.lock(); + + if !guard.powered && power_op { + guard.disk = Some(NullBlkDevice::new( + &guard.name, + guard.block_size, + guard.rotational, + guard.capacity_mib, + guard.irq_mode, + )?); + guard.powered = true; + } else if guard.powered && !power_op { + drop(guard.disk.take()); + guard.powered = false; + } + + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<1> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().block_size))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = text.parse::<u32>().map_err(|_| EINVAL)?; + + GenDiskBuilder::validate_block_size(value)?; + this.data.lock().block_size = value; + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<2> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + + if this.data.lock().rotational { + writer.write_str("1\n")?; + } else { + writer.write_str("0\n")?; + } + + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + this.data.lock().rotational = kstrtobool_bytes(page)?; + + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<3> for DeviceConfig { + type Data = DeviceConfig; + + fn 
show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().capacity_mib))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = text.parse::<u64>().map_err(|_| EINVAL)?; + + this.data.lock().capacity_mib = value; + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<4> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().irq_mode))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = text.parse::<u8>().map_err(|_| EINVAL)?; + + this.data.lock().irq_mode = IRQMode::try_from(value)?; + Ok(()) + } +} diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs new file mode 100644 index 000000000000..1ec694d7f1a6 --- /dev/null +++ b/drivers/block/rnull/rnull.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This is a Rust implementation of the C null block driver. + +mod configfs; + +use configfs::IRQMode; +use kernel::{ + block::{ + self, + mq::{ + self, + gen_disk::{self, GenDisk}, + Operations, TagSet, + }, + }, + error::Result, + pr_info, + prelude::*, + sync::Arc, + types::ARef, +}; +use pin_init::PinInit; + +module! { + type: NullBlkModule, + name: "rnull_mod", + authors: ["Andreas Hindborg"], + description: "Rust implementation of the C null block driver", + license: "GPL v2", +} + +#[pin_data] +struct NullBlkModule { + #[pin] + configfs_subsystem: kernel::configfs::Subsystem<configfs::Config>, +} + +impl kernel::InPlaceModule for NullBlkModule { + fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> { + pr_info!("Rust null_blk loaded\n"); + + try_pin_init!(Self { + configfs_subsystem <- configfs::subsystem(), + }) + } +} + +struct NullBlkDevice; + +impl NullBlkDevice { + fn new( + name: &CStr, + block_size: u32, + rotational: bool, + capacity_mib: u64, + irq_mode: IRQMode, + ) -> Result<GenDisk<Self>> { + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), GFP_KERNEL)?; + + let queue_data = Box::new(QueueData { irq_mode }, GFP_KERNEL)?; + + gen_disk::GenDiskBuilder::new() + .capacity_sectors(capacity_mib << (20 - block::SECTOR_SHIFT)) + .logical_block_size(block_size)? + .physical_block_size(block_size)? + .rotational(rotational) + .build(fmt!("{}", name.to_str()?), tagset, queue_data) + } +} + +struct QueueData { + irq_mode: IRQMode, +} + +#[vtable] +impl Operations for NullBlkDevice { + type QueueData = KBox<QueueData>; + + #[inline(always)] + fn queue_rq(queue_data: &QueueData, rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result { + match queue_data.irq_mode { + IRQMode::None => mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. 
+ .expect("Fatal error - expected to be able to end request"), + IRQMode::Soft => mq::Request::complete(rq), + } + Ok(()) + } + + fn commit_rqs(_queue_data: &QueueData) {} + + fn complete(rq: ARef<mq::Request<Self>>) { + mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. + .expect("Fatal error - expected to be able to end request"); + } +} diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7af21fe67671..db1fe9772a4d 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) return vio_dring_avail(dr, VDC_TX_RING_SIZE); } -static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct gendisk *disk = bdev->bd_disk; sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; @@ -1189,7 +1188,7 @@ static void vdc_ldc_reset(struct vdc_port *port) } if (port->ldc_timeout) - mod_delayed_work(system_wq, &port->ldc_reset_timer_work, + mod_delayed_work(system_percpu_wq, &port->ldc_reset_timer_work, round_jiffies(jiffies + HZ * port->ldc_timeout)); mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); return; @@ -1217,7 +1216,7 @@ static int __init vdc_init(void) { int err; - sunvdc_wq = alloc_workqueue("sunvdc", 0, 0); + sunvdc_wq = alloc_workqueue("sunvdc", WQ_PERCPU, 0); if (!sunvdc_wq) return -ENOMEM; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index eda33c5eb5e2..416015947ae6 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } -static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct floppy_struct *g; int ret; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8fdc26a61104..0c74a41a6753 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -201,7 +201,6 @@ struct ublk_queue { bool force_abort; bool canceling; bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ - unsigned short nr_io_ready; /* how many ios setup */ spinlock_t cancel_lock; struct ublk_device *dev; struct ublk_io ios[]; @@ -234,7 +233,7 @@ struct ublk_device { struct ublk_params params; struct completion completion; - unsigned int nr_queues_ready; + u32 nr_io_ready; bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; @@ -252,8 +251,7 @@ static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, struct ublk_io *io, - size_t offset); + u16 q_id, u16 tag, struct ublk_io *io, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc * @@ -532,7 +530,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif -static inline void __ublk_complete_rq(struct request *req); +static inline void __ublk_complete_rq(struct request *req, struct ublk_io 
*io, + bool need_map); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -664,22 +663,44 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; } +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_AUTO_BUF_REG; } +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_USER_COPY; } +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && !ublk_support_auto_buf_reg(ubq); } +static inline bool ublk_dev_need_map_io(const struct ublk_device *ub) +{ + return !ublk_dev_support_user_copy(ub) && + !ublk_dev_support_zero_copy(ub) && + !ublk_dev_support_auto_buf_reg(ub); +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { /* @@ -697,6 +718,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) ublk_support_auto_buf_reg(ubq); } +static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) +{ + return ublk_dev_support_user_copy(ub) || + ublk_dev_support_zero_copy(ub) || + ublk_dev_support_auto_buf_reg(ub); +} + static inline void ublk_init_req_ref(const struct ublk_queue *ubq, struct ublk_io *io) { @@ -711,8 +739,11 @@ static inline bool ublk_get_req_ref(struct ublk_io *io) static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) { - if (refcount_dec_and_test(&io->ref)) - __ublk_complete_rq(req); + if (!refcount_dec_and_test(&io->ref)) + return; + + /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ + __ublk_complete_rq(req, io, false); } static inline bool ublk_sub_req_ref(struct ublk_io *io) @@ -728,6 +759,11 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_NEED_GET_DATA; } +static inline bool ublk_dev_need_get_data(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_NEED_GET_DATA; +} + /* Called in slow path only, keep it noinline for trace purpose */ static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) { @@ -764,11 +800,9 @@ static inline int __ublk_queue_cmd_buf_size(int depth) return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); } -static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - - return __ublk_queue_cmd_buf_size(ubq->q_depth); + return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth); } static int ublk_max_cmd_buf_size(void) @@ -1019,13 +1053,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, return rq_bytes; } -static int ublk_unmap_io(const struct ublk_queue *ubq, +static int ublk_unmap_io(bool need_map, const struct request *req, const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); - if (!ublk_need_map_io(ubq)) + if (!need_map) return rq_bytes; if (ublk_need_unmap_req(req)) { @@ -1072,13 +1106,8 @@ static blk_status_t 
ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - enum req_op op = req_op(req); u32 ublk_op; - if (!ublk_queue_is_zoned(ubq) && - (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) - return BLK_STS_IOERR; - switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -1117,10 +1146,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( } /* todo: handle partial completion */ -static inline void __ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, + bool need_map) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1144,7 +1172,7 @@ static inline void __ublk_complete_rq(struct request *req) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ - unmapped_bytes = ublk_unmap_io(ubq, req, io); + unmapped_bytes = ublk_unmap_io(need_map, req, io); /* * Extremely impossible since we got data filled in just before @@ -1500,9 +1528,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; - /* All old ioucmds have to be completed */ - ubq->nr_io_ready = 0; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1551,7 +1576,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; - ub->nr_queues_ready = 0; + ub->nr_io_ready = 0; ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -1775,23 +1800,23 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) __func__, q_id, current->pid, vma->vm_start, phys_off, (unsigned long)sz); - if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + if (sz != ublk_queue_cmd_buf_size(ub)) return -EINVAL; pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } -static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, +static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) + if (ublk_nosrv_should_reissue_outstanding(ub)) blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); } } @@ -1811,7 +1836,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) struct ublk_io *io = &ubq->ios[i]; if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) - __ublk_fail_req(ubq, io, io->req); + __ublk_fail_req(ub, io, io->req); } } @@ -1916,9 +1941,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } -static inline bool ublk_queue_ready(struct ublk_queue *ubq) +static inline bool ublk_dev_ready(const struct ublk_device *ub) { - return ubq->nr_io_ready == ubq->q_depth; + u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; + + return ub->nr_io_ready == total; } static void ublk_cancel_queue(struct ublk_queue *ubq) @@ -2042,16 +2069,14 @@ static void ublk_reset_io_flags(struct ublk_device *ub) } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +static void ublk_mark_io_ready(struct ublk_device *ub) 
__must_hold(&ub->mutex) { - ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) - ub->nr_queues_ready++; if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) ub->unprivileged_daemons = true; - if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { + ub->nr_io_ready++; + if (ublk_dev_ready(ub)) { /* now we are ready for handling ublk io request */ ublk_reset_io_flags(ub); complete_all(&ub->completion); @@ -2122,11 +2147,11 @@ ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) } static inline int -ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, +ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, struct io_uring_cmd *cmd, unsigned long buf_addr, u16 *buf_idx) { - if (ublk_support_auto_buf_reg(ubq)) + if (ublk_dev_support_auto_buf_reg(ub)) return ublk_handle_auto_buf_reg(io, cmd, buf_idx); io->addr = buf_addr; @@ -2165,18 +2190,18 @@ static void ublk_io_release(void *priv) } static int ublk_register_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, + struct ublk_device *ub, + u16 q_id, u16 tag, struct ublk_io *io, unsigned int index, unsigned int issue_flags) { - struct ublk_device *ub = cmd->file->private_data; struct request *req; int ret; - if (!ublk_support_zero_copy(ubq)) + if (!ublk_dev_support_zero_copy(ub)) return -EINVAL; - req = __ublk_check_and_get_req(ub, ubq, io, 0); + req = __ublk_check_and_get_req(ub, q_id, tag, io, 0); if (!req) return -EINVAL; @@ -2192,7 +2217,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, struct ublk_io *io, + struct ublk_device *ub, + u16 q_id, u16 tag, struct ublk_io *io, unsigned index, unsigned issue_flags) { unsigned new_registered_buffers; @@ -2205,9 +2231,10 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, */ new_registered_buffers = io->task_registered_buffers + 1; if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) - return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); - if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) + if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) return -EINVAL; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, @@ -2229,14 +2256,14 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, return io_buffer_unregister_bvec(cmd, index, issue_flags); } -static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) +static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) { - if (ublk_need_map_io(ubq)) { + if (ublk_dev_need_map_io(ub)) { /* * FETCH_RQ has to provide IO buffer if NEED GET * DATA is not enabled */ - if (!buf_addr && !ublk_need_get_data(ubq)) + if (!buf_addr && !ublk_dev_need_get_data(ub)) return -EINVAL; } else if (buf_addr) { /* User copy requires addr to be unset */ @@ -2245,10 +2272,9 @@ static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) return 0; } -static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, +static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, struct ublk_io *io, __u64 buf_addr) { - struct ublk_device *ub = ubq->dev; int ret = 0; /* @@ -2257,8 +2283,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 
*/ mutex_lock(&ub->mutex); - /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ - if (ublk_queue_ready(ubq)) { + /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ + if (ublk_dev_ready(ub)) { ret = -EBUSY; goto out; } @@ -2272,28 +2298,28 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); + ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); if (ret) goto out; WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub, ubq); + ublk_mark_io_ready(ub); out: mutex_unlock(&ub->mutex); return ret; } -static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, +static int ublk_check_commit_and_fetch(const struct ublk_device *ub, struct ublk_io *io, __u64 buf_addr) { struct request *req = io->req; - if (ublk_need_map_io(ubq)) { + if (ublk_dev_need_map_io(ub)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. */ - if (!buf_addr && (!ublk_need_get_data(ubq) || + if (!buf_addr && (!ublk_dev_need_get_data(ub) || req_op(req) == REQ_OP_READ)) return -EINVAL; } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { @@ -2307,10 +2333,10 @@ static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, return 0; } -static bool ublk_need_complete_req(const struct ublk_queue *ubq, +static bool ublk_need_complete_req(const struct ublk_device *ub, struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) + if (ublk_dev_need_req_ref(ub)) return ublk_sub_req_ref(io); return true; } @@ -2333,23 +2359,28 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, return ublk_start_io(ubq, req, io); } -static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, - unsigned int issue_flags, - const struct ublksrv_io_cmd *ub_cmd) +static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, + unsigned int issue_flags) { + /* May point to userspace-mapped memory */ + const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; u32 cmd_op = cmd->cmd_op; - unsigned tag = ub_cmd->tag; + u16 q_id = READ_ONCE(ub_src->q_id); + u16 tag = READ_ONCE(ub_src->tag); + s32 result = READ_ONCE(ub_src->result); + u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */ struct request *req; int ret; bool compl; + WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); + pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", - __func__, cmd->cmd_op, ub_cmd->q_id, tag, - ub_cmd->result); + __func__, cmd->cmd_op, q_id, tag, result); ret = ublk_check_cmd_op(cmd_op); if (ret) @@ -2360,25 +2391,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * so no need to validate the q_id, tag, or task */ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) - return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, - issue_flags); + return ublk_unregister_io_buf(cmd, ub, addr, issue_flags); ret = -EINVAL; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + if (q_id >= ub->dev_info.nr_hw_queues) goto out; - ubq = ublk_get_queue(ub, ub_cmd->q_id); + ubq = ublk_get_queue(ub, q_id); - if (tag >= ubq->q_depth) + if (tag >= ub->dev_info.queue_depth) goto out; io = &ubq->ios[tag]; /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { - ret = 
ublk_check_fetch_buf(ubq, ub_cmd->addr); + ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + ret = ublk_fetch(cmd, ub, io, addr); if (ret) goto out; @@ -2392,8 +2422,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * so can be handled on any task */ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) - return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, - issue_flags); + return ublk_register_io_buf(cmd, ub, q_id, tag, io, + addr, issue_flags); goto out; } @@ -2414,24 +2444,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, + return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); + ret = ublk_check_commit_and_fetch(ub, io, addr); if (ret) goto out; - io->res = ub_cmd->result; + io->res = result; req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); - compl = ublk_need_complete_req(ubq, io); + ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); + compl = ublk_need_complete_req(ub, io); /* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX) io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = ub_cmd->zone_append_lba; + req->__sector = addr; if (compl) - __ublk_complete_rq(req); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); if (ret) goto out; @@ -2443,7 +2473,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * request */ req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); + ret = ublk_config_io_buf(ub, io, cmd, addr, NULL); WARN_ON_ONCE(ret); if (likely(ublk_get_data(ubq, io, req))) { __ublk_prep_compl_io_cmd(io, req); @@ -2463,16 +2493,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) + u16 q_id, u16 tag, struct ublk_io *io, size_t offset) { - unsigned tag = io - ubq->ios; struct request *req; /* * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, * which would overwrite it with io->cmd */ - req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); if (!req) return NULL; @@ -2494,26 +2523,6 @@ fail_put: return NULL; } -static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - /* - * Not necessary for async retry, but let's keep it simple and always - * copy the values to avoid any potential reuse. 
- */ - const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); - const struct ublksrv_io_cmd ub_cmd = { - .q_id = READ_ONCE(ub_src->q_id), - .tag = READ_ONCE(ub_src->tag), - .result = READ_ONCE(ub_src->result), - .addr = READ_ONCE(ub_src->addr) - }; - - WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); - - return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); -} - static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -2583,17 +2592,14 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, return ERR_PTR(-EINVAL); ubq = ublk_get_queue(ub, q_id); - if (!ubq) - return ERR_PTR(-EINVAL); - - if (!ublk_support_user_copy(ubq)) + if (!ublk_dev_support_user_copy(ub)) return ERR_PTR(-EACCES); - if (tag >= ubq->q_depth) + if (tag >= ub->dev_info.queue_depth) return ERR_PTR(-EINVAL); *io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); + req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off); if (!req) return ERR_PTR(-EINVAL); @@ -2656,7 +2662,7 @@ static const struct file_operations ublk_ch_fops = { static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { - int size = ublk_queue_cmd_buf_size(ub, q_id); + int size = ublk_queue_cmd_buf_size(ub); struct ublk_queue *ubq = ublk_get_queue(ub, q_id); int i; @@ -2683,7 +2689,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; - size = ublk_queue_cmd_buf_size(ub, q_id); + size = ublk_queue_cmd_buf_size(ub); ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); if (!ptr) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e649fa67bac1..f061420dfb10 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -829,9 +829,9 @@ out: } /* We provide getgeo only to please some old bootloader/partitioning tools */ -static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct virtio_blk *vblk = bd->bd_disk->private_data; + struct virtio_blk *vblk = disk->private_data; int ret = 0; mutex_lock(&vblk->vdev_mutex); @@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) /* some standard values, similar to sd */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; } out: mutex_unlock(&vblk->vdev_mutex); @@ -1682,7 +1682,7 @@ static int __init virtio_blk_init(void) { int error; - virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); + virtblk_wq = alloc_workqueue("virtio-blk", WQ_PERCPU, 0); if (!virtblk_wq) return -ENOMEM; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5babe575c288..04fc6b552c04 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg) schedule_work(&rinfo->work); } -static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg) { /* We don't have real geometry info, but let's at least return values consistent with the size of the device */ - sector_t nsect = get_capacity(bd->bd_disk); + sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; hg->heads = 0xff; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f31652085adc..42809cf6dd82 100644 --- 
a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1085,7 +1085,7 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page,
 	work.entry = entry;
 	INIT_WORK_ONSTACK(&work.work, zram_sync_read);
-	queue_work(system_unbound_wq, &work.work);
+	queue_work(system_dfl_wq, &work.work);
 	flush_work(&work.work);
 	destroy_work_on_stack(&work.work);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ddb37f6670de..07c19b2182ca 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -37,6 +37,32 @@ config BLK_DEV_MD
 
 	  If unsure, say N.
 
+config MD_BITMAP
+	bool "MD RAID bitmap support"
+	default y
+	depends on BLK_DEV_MD
+	help
+	  If you say Y here, support for the write intent bitmap will be
+	  enabled. The bitmap can be used to optimize resync speed after a
+	  power failure or after re-adding a disk, limiting the resync to
+	  the dirty sectors recorded in the bitmap.
+
+	  This feature can be added to an existing MD array, or an MD array
+	  can be created with a bitmap, via mdadm(8).
+
+	  If unsure, say Y.
+
+config MD_LLBITMAP
+	bool "MD RAID lockless bitmap support"
+	depends on BLK_DEV_MD
+	help
+	  If you say Y here, support for the lockless write intent bitmap
+	  will be enabled.
+
+	  Note that this is an experimental feature.
+
+	  If unsure, say N.
+
 config MD_AUTODETECT
 	bool "Autodetect RAID arrays during kernel boot"
 	depends on BLK_DEV_MD=y
@@ -54,6 +80,7 @@ config MD_AUTODETECT
 config MD_BITMAP_FILE
 	bool "MD bitmap file support (deprecated)"
 	default y
+	depends on MD_BITMAP
 	help
 	  If you say Y here, support for write intent bitmaps in files on an
 	  external file system is enabled. This is an alternative to the internal
@@ -174,6 +201,7 @@ config MD_RAID456
 
 config MD_CLUSTER
 	tristate "Cluster Support for MD"
+	select MD_BITMAP
 	depends on BLK_DEV_MD
 	depends on DLM
 	default n
@@ -393,6 +421,7 @@ config DM_RAID
 	select MD_RAID1
 	select MD_RAID10
 	select MD_RAID456
+	select MD_BITMAP
 	select BLK_DEV_MD
 	help
 	  A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 87bdfc9fe14c..5a51b3408b70 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o
 dm-verity-y += dm-verity-target.o
 dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
 
-md-mod-y += md.o md-bitmap.o
+md-mod-y += md.o
+md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o
+md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o
 raid456-y += raid5.o raid5-cache.o raid5-ppl.o
 linear-y += md-linear.o
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 7510d1c983a5..f327456fc4e0 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	check = bio_kmalloc(nr_segs, GFP_NOIO);
 	if (!check)
 		return;
-	bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs,
-		 REQ_OP_READ);
+	bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ);
 	check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
 	check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 020712c5203f..2386d08bf4e4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
 	struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 
-	bio_init(bio, NULL, bio->bi_inline_vecs,
-		 meta_bucket_pages(&c->cache->sb), 0);
+	bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0);
 	return bio;
 }
diff --git
a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..8f3a23f4b168 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index cf17fd46e255..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, 
clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f4b904e24328..0a1788fed68c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3955,9 +3955,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { struct mddev *mddev = &rs->md; - r = mddev->bitmap_ops->load(mddev); - if (r) - DMERR("Failed to load bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->load(mddev); + if (r) + DMERR("Failed to load bitmap"); + } } return r; @@ -4072,10 +4074,12 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize, false); - if (r) - DMERR("Failed to resize bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize); + if (r) + DMERR("Failed to resize bitmap"); + } } /* Check for any resize/reshape on @rs and adjust/initiate */ diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS; bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a44e8c2dccee..7bd6fa05b00a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 334b71404930..84b7e2af6dba 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -34,15 +34,6 @@ #include "md-bitmap.h" #include "md-cluster.h" -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - /* * in-memory bitmap: * @@ -224,6 +215,8 @@ struct bitmap { int cluster_slot; }; +static struct workqueue_struct *md_bitmap_wq; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? 
mdname(bitmap->mddev) : "mdX";
 }
 
-static bool __bitmap_enabled(struct bitmap *bitmap)
-{
-	return bitmap->storage.filemap &&
-	       !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-static bool bitmap_enabled(struct mddev *mddev)
+static bool bitmap_enabled(void *data, bool flush)
 {
-	struct bitmap *bitmap = mddev->bitmap;
+	struct bitmap *bitmap = data;
 
-	if (!bitmap)
-		return false;
+	if (!flush)
+		return true;
 
-	return __bitmap_enabled(bitmap);
+	/*
+	 * If the caller wants to flush bitmap pages to the underlying disks,
+	 * check whether there are cached pages in the filemap.
+	 */
+	return !test_bit(BITMAP_STALE, &bitmap->flags) &&
+	       bitmap->storage.filemap != NULL;
 }
 
 /*
@@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 		return -EINVAL;
 	}
 
-	md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
+	md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
+			  page, 0);
 	return 0;
 }
 
@@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap)
 	int dirty, need_write;
 	int writing = 0;
 
-	if (!__bitmap_enabled(bitmap))
+	if (!bitmap_enabled(bitmap, true))
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
@@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
 				sector_t *blocks, bool degraded)
 {
 	bitmap_counter_t *bmc;
-	bool rv;
+	bool rv = false;
 
-	if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
-		*blocks = 1024;
-		return true; /* always resync if no bitmap */
-	}
 	spin_lock_irq(&bitmap->counts.lock);
-
-	rv = false;
 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
 	if (bmc) {
 		/* locked */
@@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
 	bitmap_counter_t *bmc;
 	unsigned long flags;
 
-	if (bitmap == NULL) {
-		*blocks = 1024;
-		return;
-	}
 	spin_lock_irqsave(&bitmap->counts.lock, flags);
 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
 	if (bmc == NULL)
@@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev)
 	struct bitmap *bitmap = mddev->bitmap;
 	int bw;
 
-	if (!bitmap)
-		return;
-
 	atomic_inc(&bitmap->behind_writes);
 	bw = atomic_read(&bitmap->behind_writes);
 	if (bw > bitmap->behind_writes_used)
@@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
 
-	if (!bitmap)
-		return;
-
 	if (atomic_dec_and_test(&bitmap->behind_writes))
 		wake_up(&bitmap->behind_wait);
 	pr_debug("dec write-behind count %d/%lu\n",
@@ -2593,15 +2570,14 @@ err:
 	return ret;
 }
 
-static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
-			 bool init)
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
 {
 	struct bitmap *bitmap = mddev->bitmap;
 
 	if (!bitmap)
 		return 0;
 
-	return __bitmap_resize(bitmap, blocks, chunksize, init);
+	return __bitmap_resize(bitmap, blocks, chunksize, false);
 }
 
 static ssize_t
@@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = {
 	&max_backlog_used.attr,
 	NULL
 };
-const struct attribute_group md_bitmap_group = {
+
+static struct attribute_group md_bitmap_group = {
 	.name = "bitmap",
 	.attrs = md_bitmap_attrs,
 };
 
 static struct bitmap_operations bitmap_ops = {
+	.head = {
+		.type	= MD_BITMAP,
+		.id	= ID_BITMAP,
+		.name	= "bitmap",
+	},
+
 	.enabled = bitmap_enabled,
 	.create = bitmap_create,
 	.resize = bitmap_resize,
@@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = {
 	.start_write = bitmap_start_write,
 	.end_write = bitmap_end_write,
+	.start_discard = bitmap_start_write,
+	.end_discard = bitmap_end_write,
+
 	.start_sync = bitmap_start_sync,
 	.end_sync = bitmap_end_sync,
 	.cond_end_sync = bitmap_cond_end_sync,
@@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = {
 	.copy_from_slot = bitmap_copy_from_slot,
 	.set_pages = bitmap_set_pages,
 	.free = md_bitmap_free,
+
+	.group = &md_bitmap_group,
 };
 
-void mddev_set_bitmap_ops(struct mddev *mddev)
+int md_bitmap_init(void)
+{
+	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+				       0);
+	if (!md_bitmap_wq)
+		return -ENOMEM;
+
+	return register_md_submodule(&bitmap_ops.head);
+}
+
+void md_bitmap_exit(void)
 {
-	mddev->bitmap_ops = &bitmap_ops;
+	destroy_workqueue(md_bitmap_wq);
+	unregister_md_submodule(&bitmap_ops.head);
 }
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 59e9dd45cfde..b42a28fa83a0 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -9,10 +9,26 @@
 
 #define BITMAP_MAGIC 0x6d746962
 
+/*
+ * Version 3 uses host-endian order; it is deprecated and not used for new
+ * arrays.
+ */
+#define BITMAP_MAJOR_LO 3
+#define BITMAP_MAJOR_HOSTENDIAN 3
+/* version 4 uses little-endian order and is the default */
+#define BITMAP_MAJOR_HI 4
+/* version 5 is only used for clustered devices */
+#define BITMAP_MAJOR_CLUSTERED 5
+/* version 6 is only used for the lockless bitmap */
+#define BITMAP_MAJOR_LOCKLESS 6
+
 /* use these for bitmap->flags and bitmap->sb->state bit-fields */
 enum bitmap_state {
-	BITMAP_STALE	   = 1,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_STALE	   = 1, /* the bitmap file is out of date or had -EIO */
 	BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+	BITMAP_FIRST_USE   = 3, /* llbitmap is just created */
+	BITMAP_CLEAN	   = 4, /* llbitmap is created with assume_clean */
+	BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */
 	BITMAP_HOSTENDIAN  =15,
 };
 
@@ -61,11 +77,15 @@ struct md_bitmap_stats {
 	struct file *file;
 };
 
+typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset,
+			    unsigned long sectors);
+
 struct bitmap_operations {
-	bool (*enabled)(struct mddev *mddev);
+	struct md_submodule_head head;
+
+	bool (*enabled)(void *data, bool flush);
 	int (*create)(struct mddev *mddev);
-	int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
-		      bool init);
+	int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize);
 	int (*load)(struct mddev *mddev);
 	void (*destroy)(struct mddev *mddev);
 
@@ -80,10 +100,13 @@ struct bitmap_operations {
 	void (*end_behind_write)(struct mddev *mddev);
 	void (*wait_behind_writes)(struct mddev *mddev);
 
-	void (*start_write)(struct mddev *mddev, sector_t offset,
-			    unsigned long sectors);
-	void (*end_write)(struct mddev *mddev, sector_t offset,
-			  unsigned long sectors);
+	md_bitmap_fn *start_write;
+	md_bitmap_fn *end_write;
+	md_bitmap_fn *start_discard;
+	md_bitmap_fn *end_discard;
+
+	sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset);
+	bool (*blocks_synced)(struct mddev *mddev, sector_t offset);
 	bool (*start_sync)(struct mddev *mddev, sector_t offset,
 			   sector_t *blocks, bool degraded);
 	void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
@@ -101,9 +124,75 @@ struct bitmap_operations {
 			      sector_t *hi, bool clear_bits);
 	void (*set_pages)(void *data, unsigned long pages);
 	void (*free)(void *data);
+
+	struct attribute_group *group;
 };
 
 /* the bitmap API */
-void mddev_set_bitmap_ops(struct mddev *mddev);
+static inline bool md_bitmap_registered(struct mddev *mddev)
+{
+	return mddev->bitmap_ops != NULL;
+}
+
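(Editorial aside, not part of the patch: a hypothetical caller sketch showing the calling pattern these wrappers enable. `example_account_write` is an invented name assumed to live in md core; it relies on the `md_bitmap_enabled()` helper defined just below and on the `start_write`/`end_write` hooks declared in `bitmap_operations` above. This is a sketch of the intended usage, not code from the patch.)

/*
 * Hypothetical caller sketch: guard the optional bitmap hooks behind
 * md_bitmap_enabled() so arrays without a bitmap pay no cost.
 */
static void example_account_write(struct mddev *mddev, sector_t offset,
				  unsigned long sectors)
{
	/* no bitmap ops registered, or no bitmap attached: nothing to do */
	if (!md_bitmap_enabled(mddev, false))
		return;

	mddev->bitmap_ops->start_write(mddev, offset, sectors);
	/* ... submit the data write and wait for it to complete ... */
	mddev->bitmap_ops->end_write(mddev, offset, sectors);
}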
+static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) +{ + /* bitmap_ops must be registered before creating bitmap. */ + if (!md_bitmap_registered(mddev)) + return false; + + if (!mddev->bitmap) + return false; + + return mddev->bitmap_ops->enabled(mddev->bitmap, flush); +} + +static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + /* always resync if no bitmap */ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return true; + } + + return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); +} + +static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return; + } + + mddev->bitmap_ops->end_sync(mddev, offset, blocks); +} + +#ifdef CONFIG_MD_BITMAP +int md_bitmap_init(void); +void md_bitmap_exit(void); +#else +static inline int md_bitmap_init(void) +{ + return 0; +} +static inline void md_bitmap_exit(void) +{ +} +#endif + +#ifdef CONFIG_MD_LLBITMAP +int md_llbitmap_init(void); +void md_llbitmap_exit(void); +#else +static inline int md_llbitmap_init(void) +{ + return 0; +} +static inline void md_llbitmap_exit(void) +{ +} +#endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 6e9a0045f0ff..11f1e91d387d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) ret = mddev->bitmap_ops->resize(mddev, le64_to_cpu(msg->high), - 0, false); + 0); break; default: ret = -1; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 3e1f165c2d20..7033d982d377 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -257,18 +257,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, + &mddev->bio_set); + if (!bio) return true; - } - - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; } md_account_bio(mddev, &bio); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c new file mode 100644 index 000000000000..1eb434306162 --- /dev/null +++ b/drivers/md/md-llbitmap.c @@ -0,0 +1,1626 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <trace/events/block.h> + +#include "md.h" +#include "md-bitmap.h" + +/* + * #### Background + * + * Redundant data is used to enhance data fault tolerance, and the storage + * methods for redundant data vary depending on the RAID levels. And it's + * important to maintain the consistency of redundant data. + * + * Bitmap is used to record which data blocks have been synchronized and which + * ones need to be resynchronized or recovered. Each bit in the bitmap + * represents a segment of data in the array. 
When a bit is set, it indicates
+ * that the multiple redundant copies of that data segment may not be
+ * consistent. Data synchronization can be performed based on the bitmap after
+ * a power failure or after re-adding a disk. If there is no bitmap, a full
+ * disk synchronization is required.
+ *
+ * #### Key Features
+ *
+ * - The IO fastpath is lockless: if the user issues lots of write IO to the
+ *   same bitmap bit in a short time, only the first write has the additional
+ *   overhead of updating the bitmap bit; there is no additional overhead for
+ *   the following writes;
+ * - support resyncing or recovering only written data, meaning that when
+ *   creating a new array or replacing a disk with a new one, there is no
+ *   need to do a full disk resync/recovery;
+ *
+ * #### Key Concept
+ *
+ * ##### State Machine
+ *
+ * Each bit is one byte and can hold 6 different states, see llbitmap_state.
+ * There are a total of 8 different actions, see llbitmap_action, that can
+ * change a bit's state:
+ *
+ * llbitmap state machine: transitions between states
+ *
+ * |           | Startwrite | Startsync | Endsync | Abortsync |
+ * | --------- | ---------- | --------- | ------- | --------- |
+ * | Unwritten | Dirty      | x         | x       | x         |
+ * | Clean     | Dirty      | x         | x       | x         |
+ * | Dirty     | x          | x         | x       | x         |
+ * | NeedSync  | x          | Syncing   | x       | x         |
+ * | Syncing   | x          | Syncing   | Dirty   | NeedSync  |
+ *
+ * |           | Reload   | Daemon | Discard   | Stale     |
+ * | --------- | -------- | ------ | --------- | --------- |
+ * | Unwritten | x        | x      | x         | x         |
+ * | Clean     | x        | x      | Unwritten | NeedSync  |
+ * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
+ * | NeedSync  | x        | x      | Unwritten | x         |
+ * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
+ *
+ * Typical scenarios:
+ *
+ * 1) Create a new array
+ * All bits will be set to Unwritten by default; if --assume-clean is set,
+ * all bits will be set to Clean instead.
+ *
+ * 2) Write data; raid1/raid10 have a full copy of the data, while raid456
+ * doesn't and relies on xor data
+ *
+ * 2.1) write new data to raid1/raid10:
+ * Unwritten --StartWrite--> Dirty
+ *
+ * 2.2) write new data to raid456:
+ * Unwritten --StartWrite--> NeedSync
+ *
+ * Because the initial recovery for raid456 is skipped, the xor data is not
+ * built yet; the bit must be set to NeedSync first, and after the lazy
+ * initial recovery is finished, the bit will finally be set to Dirty (see
+ * 5.1 and 5.4);
+ *
+ * 2.3) overwrite existing data
+ * Clean --StartWrite--> Dirty
+ *
+ * 3) daemon, if the array is not degraded:
+ * Dirty --Daemon--> Clean
+ *
+ * 4) discard
+ * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
+ *
+ * 5) resync and recover
+ *
+ * 5.1) common process
+ * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
+ *
+ * 5.2) resync after power failure
+ * Dirty --Reload--> NeedSync
+ *
+ * 5.3) recover while replacing with a new disk
+ * By default, the old bitmap framework will recover all data; llbitmap
+ * avoids this with a new helper, llbitmap_skip_sync_blocks, which skips
+ * recovery for bits other than dirty or clean;
+ *
+ * 5.4) lazy initial recovery for raid5:
+ * By default, the old bitmap framework only allows new recovery when there
+ * are spares (new disks); a new recovery flag, MD_RECOVERY_LAZY_RECOVER, is
+ * added to perform raid456 lazy recovery for the bits set in 2.2.
+ *
+ * 6) special handling for a degraded array:
+ *
+ * - Dirty bits will never be cleared; the daemon will just do nothing, so
+ *   that if a disk is re-added, Clean bits can be skipped during recovery;
+ * - Dirty bits will convert to Syncing on start write, to do data recovery
+ *   for newly added disks;
+ * - New writes will convert bits to NeedSync directly;
+ *
+ * ##### Bitmap IO
+ *
+ * ##### Chunksize
+ *
+ * The default bitmap size is 128k, including a 1k bitmap super block, and
+ * the default size of the segment of data covered by each bit (the chunk
+ * size) is 64k; the chunk size is doubled repeatedly while the total number
+ * of bits is not less than 127k (see llbitmap_init).
+ *
+ * ##### READ
+ *
+ * While creating the bitmap, all pages will be allocated and read for the
+ * llbitmap; there will be no reads afterwards.
+ *
+ * ##### WRITE
+ *
+ * WRITE IO is divided into blocks of the array's logical_block_size, and the
+ * dirty state of each block is tracked independently. For example:
+ *
+ * each page is 4k and contains 8 blocks; each block is 512 bytes and
+ * contains 512 bits;
+ *
+ * | page0 | page1 | ... | page 31 |
+ * |       |
+ * |        \-----------------------\
+ * |                                |
+ * | block0 | block1 | ... | block 7 |
+ * |        |
+ * |         \-----------------\
+ * |                           |
+ * | bit0 | bit1 | ... | bit511 |
+ *
+ * From the IO path, if one bit is changed to Dirty or NeedSync, the
+ * corresponding subpage will be marked dirty, and such a block must be
+ * written before the IO is issued. This behaviour will affect IO
+ * performance; to reduce the impact, if multiple bits are changed in the
+ * same block in a short time, all bits in this block will be changed to
+ * Dirty/NeedSync, so that there won't be any additional overhead until the
+ * daemon clears the dirty bits.
+ *
+ * ##### Dirty Bits synchronization
+ *
+ * The IO fast path sets bits dirty, and those dirty bits are cleared by the
+ * daemon after the IO is done. llbitmap_page_ctl is used to synchronize
+ * between the IO path and the daemon:
+ *
+ * IO path:
+ * 1) try to grab a reference; if this succeeds, set the expire time to 5s
+ *    from now and return;
+ * 2) if grabbing a reference fails, wait for the daemon to finish clearing
+ *    dirty bits;
+ *
+ * Daemon (the daemon will be woken up every daemon_sleep seconds):
+ * For each page:
+ * 1) check if the page has expired; if not, skip it. For an expired page:
+ * 2) suspend the page and wait for inflight write IO to be done;
+ * 3) change the dirty page to clean;
+ * 4) resume the page;
+ */
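(Editorial aside, not part of the patch: the transition tables above can be exercised with a short, self-contained user-space C program. All names below are local to the example; the 'x' entries are modeled as a `None` value that leaves the state unchanged, and the walk follows scenario 5.1.)

#include <stdio.h>

/* States and actions as documented above; None stands for the 'x' entries. */
enum state  { None, Unwritten, Clean, Dirty, NeedSync, Syncing, StateCount };
enum action { Startwrite, Startsync, Endsync, Abortsync,
	      Reload, Daemon, Discard, Stale, ActionCount };

static const char *name[] = { "None", "Unwritten", "Clean", "Dirty",
			      "NeedSync", "Syncing" };

/* Transcription of the two tables; omitted entries default to None (0). */
static const enum state sm[StateCount][ActionCount] = {
	[Unwritten] = { [Startwrite] = Dirty },
	[Clean]     = { [Startwrite] = Dirty, [Discard] = Unwritten,
			[Stale] = NeedSync },
	[Dirty]     = { [Reload] = NeedSync, [Daemon] = Clean,
			[Discard] = Unwritten, [Stale] = NeedSync },
	[NeedSync]  = { [Startsync] = Syncing, [Discard] = Unwritten },
	[Syncing]   = { [Startsync] = Syncing, [Endsync] = Dirty,
			[Abortsync] = NeedSync, [Reload] = NeedSync,
			[Discard] = Unwritten, [Stale] = NeedSync },
};

int main(void)
{
	/* Scenario 5.1: NeedSync -> Syncing -> Dirty -> Clean */
	enum action path[] = { Startsync, Endsync, Daemon };
	enum state s = NeedSync;

	for (int i = 0; i < 3; i++) {
		enum state next = sm[s][path[i]];

		if (next != None)	/* 'x': keep the current state */
			s = next;
		printf("-> %s\n", name[s]);
	}
	return 0;
}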
+
+#define BITMAP_DATA_OFFSET 1024
+
+/* 64k is the max IO size of sync IO for raid1/raid10 */
+#define MIN_CHUNK_SIZE (64 * 2)
+
+/* By default, the daemon will be woken up every 30s */
+#define DEFAULT_DAEMON_SLEEP 30
+
+/*
+ * Dirtied bits that have not been accessed for more than 5s will be cleared
+ * by the daemon.
+ */
+#define DEFAULT_BARRIER_IDLE 5
+
+enum llbitmap_state {
+	/* No valid data, init state after assembling the array */
+	BitUnwritten = 0,
+	/* data is consistent */
+	BitClean,
+	/* data will be consistent after IO is done, set directly for writes */
+	BitDirty,
+	/*
+	 * data needs to be resynchronized:
+	 * 1) set directly for writes if the array is degraded, to prevent a
+	 *    full disk synchronization after re-adding a disk;
+	 * 2) reassemble the array after a power failure, and dirty bits are
+	 *    found after reloading the bitmap;
+	 * 3) set on the first write for raid5, to build the initial xor data
+	 *    lazily
+	 */
+	BitNeedSync,
+	/* data is synchronizing */
+	BitSyncing,
+	BitStateCount,
+	BitNone = 0xff,
+};
+
+enum llbitmap_action {
+	/* The user writes new data; the only action from the IO fast path */
+	BitmapActionStartwrite = 0,
+	/* Start recovery */
+	BitmapActionStartsync,
+	/* Finish recovery */
+	BitmapActionEndsync,
+	/* Failed recovery */
+	BitmapActionAbortsync,
+	/* Reassemble the array */
+	BitmapActionReload,
+	/* The daemon thread is trying to clear dirty bits */
+	BitmapActionDaemon,
+	/* Data is deleted */
+	BitmapActionDiscard,
+	/*
+	 * The bitmap is stale; mark all bits other than BitUnwritten as
+	 * BitNeedSync.
+	 */
+	BitmapActionStale,
+	BitmapActionCount,
+	/* Init state is BitUnwritten */
+	BitmapActionInit,
+};
+
+enum llbitmap_page_state {
+	LLPageFlush = 0,
+	LLPageDirty,
+};
+
+struct llbitmap_page_ctl {
+	char *state;
+	struct page *page;
+	unsigned long expire;
+	unsigned long flags;
+	wait_queue_head_t wait;
+	struct percpu_ref active;
+	/* Per block size dirty state, maximum 64k page / 1 sector = 128 */
+	unsigned long dirty[];
+};
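(Editorial aside, not part of the patch: the reference/expire protocol that struct llbitmap_page_ctl implements with a percpu_ref and a wait queue can be modeled in user space with a plain atomic counter. The sketch below is single-threaded and all names in it are invented; the busy-wait stands in for the kernel's waitqueue, and `printf` stands in for clearing dirty bits. It illustrates the rules from the "Dirty Bits synchronization" section above, not the kernel implementation.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define BLOCKED (-1)	/* models a suspended page (percpu_ref killed) */

struct page_ctl {
	atomic_int active;	/* inflight writers; BLOCKED while suspended */
	time_t expire;		/* earliest time the daemon may clean the page */
};

/* IO path: try to grab a reference; on success, postpone cleaning. */
static bool io_grab(struct page_ctl *p, int idle_secs)
{
	int old = atomic_load(&p->active);

	while (old != BLOCKED) {
		if (atomic_compare_exchange_weak(&p->active, &old, old + 1)) {
			p->expire = time(NULL) + idle_secs;
			return true;
		}
	}
	return false;	/* suspended: wait for the daemon, then retry */
}

/* IO path: drop the reference once the write is issued. */
static void io_put(struct page_ctl *p)
{
	atomic_fetch_sub(&p->active, 1);
}

/* Daemon: for an expired page, suspend, clear dirty bits, resume. */
static void daemon_clean(struct page_ctl *p)
{
	int inflight = 0;

	if (time(NULL) < p->expire)
		return;	/* page was used recently: skip it this round */

	/* suspend: from now on io_grab() fails until we resume */
	while (!atomic_compare_exchange_weak(&p->active, &inflight, BLOCKED))
		inflight = 0;	/* spin until inflight writers drain */

	printf("dirty bits cleared\n");	/* stands in for Dirty -> Clean */

	atomic_store(&p->active, 0);	/* resume the page */
}

int main(void)
{
	struct page_ctl p = { .active = 0, .expire = 0 };

	if (io_grab(&p, 5))	/* first write dirties the page */
		io_put(&p);

	daemon_clean(&p);	/* not expired yet: does nothing */
	p.expire = 0;		/* pretend 5 idle seconds have passed */
	daemon_clean(&p);	/* now suspends, cleans, and resumes */
	return 0;
}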
+
+struct llbitmap {
+	struct mddev *mddev;
+	struct llbitmap_page_ctl **pctl;
+
+	unsigned int nr_pages;
+	unsigned int io_size;
+	unsigned int blocks_per_page;
+
+	/* shift of one chunk */
+	unsigned long chunkshift;
+	/* size of one chunk in sectors */
+	unsigned long chunksize;
+	/* total number of chunks */
+	unsigned long chunks;
+	unsigned long last_end_sync;
+	/*
+	 * idle time in seconds after which dirty bits will be cleared if the
+	 * page is not accessed.
+	 */
+	unsigned long barrier_idle;
+	/* fires on the first BitDirty state */
+	struct timer_list pending_timer;
+	struct work_struct daemon_work;
+
+	unsigned long flags;
+	__u64 events_cleared;
+
+	/* for slow disks */
+	atomic_t behind_writes;
+	wait_queue_head_t behind_wait;
+};
+
+struct llbitmap_unplug_work {
+	struct work_struct work;
+	struct llbitmap *llbitmap;
+	struct completion *done;
+};
+
+static struct workqueue_struct *md_llbitmap_io_wq;
+static struct workqueue_struct *md_llbitmap_unplug_wq;
+
+static char state_machine[BitStateCount][BitmapActionCount] = {
+	[BitUnwritten] = {
+		[BitmapActionStartwrite] = BitDirty,
+		[BitmapActionStartsync] = BitNone,
+		[BitmapActionEndsync] = BitNone,
+		[BitmapActionAbortsync] = BitNone,
+		[BitmapActionReload] = BitNone,
+		[BitmapActionDaemon] = BitNone,
+		[BitmapActionDiscard] = BitNone,
+		[BitmapActionStale] = BitNone,
+	},
+	[BitClean] = {
+		[BitmapActionStartwrite] = BitDirty,
+		[BitmapActionStartsync] = BitNone,
+		[BitmapActionEndsync] = BitNone,
+		[BitmapActionAbortsync] = BitNone,
+		[BitmapActionReload] = BitNone,
+		[BitmapActionDaemon] = BitNone,
+		[BitmapActionDiscard] = BitUnwritten,
+		[BitmapActionStale] = BitNeedSync,
+	},
+	[BitDirty] = {
+		[BitmapActionStartwrite] = BitNone,
+		[BitmapActionStartsync] = BitNone,
+		[BitmapActionEndsync] = BitNone,
+		[BitmapActionAbortsync] = BitNone,
+		[BitmapActionReload] = BitNeedSync,
+		[BitmapActionDaemon] = BitClean,
+		[BitmapActionDiscard] = BitUnwritten,
+		[BitmapActionStale] = BitNeedSync,
+	},
+	[BitNeedSync] = {
+		[BitmapActionStartwrite] = BitNone,
+		[BitmapActionStartsync] = BitSyncing,
+		[BitmapActionEndsync] = BitNone,
+		[BitmapActionAbortsync] = BitNone,
+		[BitmapActionReload] = BitNone,
+		[BitmapActionDaemon] = BitNone,
+		[BitmapActionDiscard] = BitUnwritten,
+		[BitmapActionStale] = BitNone,
+	},
+	[BitSyncing] = {
+		[BitmapActionStartwrite] = BitNone,
+		[BitmapActionStartsync] = BitSyncing,
+		[BitmapActionEndsync] = BitDirty,
+		[BitmapActionAbortsync] = BitNeedSync,
+		[BitmapActionReload] = BitNeedSync,
+		[BitmapActionDaemon] = BitNone,
+		[BitmapActionDiscard] = BitUnwritten,
+		[BitmapActionStale] = BitNeedSync,
+	},
+};
+
+static void __llbitmap_flush(struct mddev *mddev);
+
+static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
+{
+	unsigned int idx;
+	unsigned int offset;
+
+	pos += BITMAP_DATA_OFFSET;
+	idx = pos >> PAGE_SHIFT;
+	offset = offset_in_page(pos);
+
+	return llbitmap->pctl[idx]->state[offset];
+}
+
+/* set all the bits in the subpage as dirty */
+static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
+				       struct llbitmap_page_ctl *pctl,
+				       unsigned int block)
+{
+	bool level_456 = raid_is_456(llbitmap->mddev);
+	unsigned int io_size = llbitmap->io_size;
+	int pos;
+
+	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+		switch (pctl->state[pos]) {
+		case BitUnwritten:
+			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
+			break;
+		case BitClean:
+			pctl->state[pos] = BitDirty;
+			break;
+		};
+	}
+}
+
+static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
+				    int offset)
+{
+	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+	unsigned int io_size = llbitmap->io_size;
+	int block = offset / io_size;
+	int pos;
+
+	if (!test_bit(LLPageDirty, &pctl->flags))
+		set_bit(LLPageDirty, &pctl->flags);
+
+	/*
+	 * For a degraded array, dirty bits will never be cleared, and we must
+	 * resync all the dirty bits; hence skip infecting new dirty bits to
+	 * avoid resyncing unnecessary data.
+ */ + if (llbitmap->mddev->degraded) { + set_bit(block, pctl->dirty); + return; + } + + /* + * The subpage usually contains a total of 512 bits. If any single bit + * within the subpage is marked as dirty, the entire sector will be + * written. To avoid impacting write performance, when multiple bits + * within the same sector are modified within llbitmap->barrier_idle, + * all bits in the sector will be collectively marked as dirty at once. + */ + if (test_and_set_bit(block, pctl->dirty)) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + if (pos == offset) + continue; + if (pctl->state[pos] == BitDirty || + pctl->state[pos] == BitNeedSync) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + } +} + +static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, + loff_t pos) +{ + unsigned int idx; + unsigned int bit; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + bit = offset_in_page(pos); + + llbitmap->pctl[idx]->state[bit] = state; + if (state == BitDirty || state == BitNeedSync) + llbitmap_set_page_dirty(llbitmap, idx, bit); +} + +static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) +{ + struct mddev *mddev = llbitmap->mddev; + struct page *page = NULL; + struct md_rdev *rdev; + + if (llbitmap->pctl && llbitmap->pctl[idx]) + page = llbitmap->pctl[idx]->page; + if (page) + return page; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return ERR_PTR(-ENOMEM); + + rdev_for_each(rdev, mddev) { + sector_t sector; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + + (idx << PAGE_SECTORS_SHIFT); + + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, + true)) + return page; + + md_error(mddev, rdev); + } + + __free_page(page); + return ERR_PTR(-EIO); +} + +static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) +{ + struct page *page = llbitmap->pctl[idx]->page; + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + int block; + + for (block = 0; block < llbitmap->blocks_per_page; block++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (!test_and_clear_bit(block, pctl->dirty)) + continue; + + rdev_for_each(rdev, mddev) { + sector_t sector; + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + rdev->sb_start + + (idx << PAGE_SECTORS_SHIFT) + + block * bit_sector; + md_write_metadata(mddev, rdev, sector, + llbitmap->io_size, page, + block * llbitmap->io_size); + } + } +} + +static void active_release(struct percpu_ref *ref) +{ + struct llbitmap_page_ctl *pctl = + container_of(ref, struct llbitmap_page_ctl, active); + + wake_up(&pctl->wait); +} + +static void llbitmap_free_pages(struct llbitmap *llbitmap) +{ + int i; + + if (!llbitmap->pctl) + return; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + if (!pctl || !pctl->page) + break; + + __free_page(pctl->page); + percpu_ref_exit(&pctl->active); + } + + kfree(llbitmap->pctl[0]); + kfree(llbitmap->pctl); + llbitmap->pctl = NULL; +} + +static int llbitmap_cache_pages(struct llbitmap *llbitmap) +{ + struct llbitmap_page_ctl *pctl; + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + + BITMAP_DATA_OFFSET, PAGE_SIZE); + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( + 
llbitmap->blocks_per_page)); + int i; + + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!llbitmap->pctl) + return -ENOMEM; + + size = round_up(size, cache_line_size()); + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); + if (!pctl) { + kfree(llbitmap->pctl); + return -ENOMEM; + } + + llbitmap->nr_pages = nr_pages; + + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { + struct page *page = llbitmap_read_page(llbitmap, i); + + llbitmap->pctl[i] = pctl; + + if (IS_ERR(page)) { + llbitmap_free_pages(llbitmap); + return PTR_ERR(page); + } + + if (percpu_ref_init(&pctl->active, active_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + __free_page(page); + llbitmap_free_pages(llbitmap); + return -ENOMEM; + } + + pctl->page = page; + pctl->state = page_address(page); + init_waitqueue_head(&pctl->wait); + } + + return 0; +} + +static void llbitmap_init_state(struct llbitmap *llbitmap) +{ + enum llbitmap_state state = BitUnwritten; + unsigned long i; + + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + state = BitClean; + + for (i = 0; i < llbitmap->chunks; i++) + llbitmap_write(llbitmap, state, i); +} + +/* The return value is only used from resync, where @start == @end. */ +static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, + unsigned long start, + unsigned long end, + enum llbitmap_action action) +{ + struct mddev *mddev = llbitmap->mddev; + enum llbitmap_state state = BitNone; + bool level_456 = raid_is_456(llbitmap->mddev); + bool need_resync = false; + bool need_recovery = false; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return BitNone; + + if (action == BitmapActionInit) { + llbitmap_init_state(llbitmap); + return BitNone; + } + + while (start <= end) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) { + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", + __func__, start, c, action); + state = BitNeedSync; + goto write_bitmap; + } + + if (c == BitNeedSync) + need_resync = !mddev->degraded; + + state = state_machine[c][action]; + +write_bitmap: + if (unlikely(mddev->degraded)) { + /* For degraded array, mark new data as need sync. */ + if (state == BitDirty && + action == BitmapActionStartwrite) + state = BitNeedSync; + /* + * For degraded array, resync dirty data as well; note + * that if the array is still degraded after resync is + * done, all new data will still be dirty until the + * array is clean. + */ + else if (c == BitDirty && + action == BitmapActionStartsync) + state = BitSyncing; + } else if (c == BitUnwritten && state == BitDirty && + action == BitmapActionStartwrite && level_456) { + /* Delay raid456 initial recovery to the first write.
*/ + state = BitNeedSync; + } + + if (state == BitNone) { + start++; + continue; + } + + llbitmap_write(llbitmap, state, start); + + if (state == BitNeedSync) + need_resync = !mddev->degraded; + else if (state == BitDirty && + !timer_pending(&llbitmap->pending_timer)) + mod_timer(&llbitmap->pending_timer, + jiffies + mddev->bitmap_info.daemon_sleep * HZ); + + start++; + } + + if (need_resync && level_456) + need_recovery = true; + + if (need_recovery) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else if (need_resync) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + + return state; +} + +static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + +retry: + if (likely(percpu_ref_tryget_live(&pctl->active))) { + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); + return; + } + + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); + goto retry; +} + +static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_put(&pctl->active); +} + +static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_kill(&pctl->active); + + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + return -ETIMEDOUT; + + return 0; +} + +static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + pctl->expire = LONG_MAX; + percpu_ref_resurrect(&pctl->active); + wake_up(&pctl->wait); +} + +static int llbitmap_check_support(struct mddev *mddev) +{ + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", + mdname(mddev)); + return -EBUSY; + } + + if (mddev->bitmap_info.space == 0) { + if (mddev->bitmap_info.default_space == 0) { + pr_notice("md/llbitmap: %s: no space for bitmap\n", + mdname(mddev)); + return -ENOSPC; + } + } + + if (!mddev->persistent) { + pr_notice("md/llbitmap: %s: array must be persistent\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.file) { + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.external) { + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev_is_dm(mddev)) { + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int llbitmap_init(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunksize = MIN_CHUNK_SIZE; + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; + int ret; + + while (chunks > space) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; + + ret 
= llbitmap_cache_pages(llbitmap); + if (ret) + return ret; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionInit); + /* flush initial llbitmap to disk */ + __llbitmap_flush(mddev); + + return 0; +} + +static int llbitmap_read_sb(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + unsigned long daemon_sleep; + unsigned long chunksize; + unsigned long events; + struct page *sb_page; + bitmap_super_t *sb; + int ret = -EINVAL; + + if (!mddev->bitmap_info.offset) { + pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev)); + return -EINVAL; + } + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("md/llbitmap: %s: read super block failed\n", + mdname(mddev)); + return -EIO; + } + + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_err("md/llbitmap: %s: invalid super block magic number\n", + mdname(mddev)); + goto out_put_page; + } + + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { + pr_err("md/llbitmap: %s: invalid super block version\n", + mdname(mddev)); + goto out_put_page; + } + + if (memcmp(sb->uuid, mddev->uuid, 16)) { + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", + mdname(mddev)); + goto out_put_page; + } + + if (mddev->bitmap_info.space == 0) { + int room = le32_to_cpu(sb->sectors_reserved); + + if (room) + mddev->bitmap_info.space = room; + else + mddev->bitmap_info.space = mddev->bitmap_info.default_space; + } + llbitmap->flags = le32_to_cpu(sb->state); + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { + ret = llbitmap_init(llbitmap); + goto out_put_page; + } + + chunksize = le32_to_cpu(sb->chunksize); + if (!is_power_of_2(chunksize)) { + pr_err("md/llbitmap: %s: chunksize not a power of 2\n", + mdname(mddev)); + goto out_put_page; + } + + if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n", + mdname(mddev), chunksize, mddev->resync_max_sectors, + mddev->bitmap_info.space); + goto out_put_page; + } + + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range\n", + mdname(mddev), daemon_sleep); + goto out_put_page; + } + + events = le64_to_cpu(sb->events); + if (events < mddev->events) { + pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n", + mdname(mddev), events, mddev->events); + set_bit(BITMAP_STALE, &llbitmap->flags); + } + + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + mddev->bitmap_info.chunksize = chunksize; + mddev->bitmap_info.daemon_sleep = daemon_sleep; + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunksize = chunksize; + llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); + llbitmap->chunkshift = ffz(~chunksize); + ret = llbitmap_cache_pages(llbitmap); + +out_put_page: + kunmap_local(sb); + __free_page(sb_page); + return ret; +} + +static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) +{ + struct llbitmap *llbitmap = + container_of(pending_timer, struct llbitmap, pending_timer); + + if (work_busy(&llbitmap->daemon_work)) { + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", + mdname(llbitmap->mddev), + llbitmap->mddev->bitmap_info.daemon_sleep); + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); + return; + } + + queue_work(md_llbitmap_io_wq,
&llbitmap->daemon_work); +} + +static void md_llbitmap_daemon_fn(struct work_struct *work) +{ + struct llbitmap *llbitmap = + container_of(work, struct llbitmap, daemon_work); + unsigned long start; + unsigned long end; + bool restart; + int idx; + + if (llbitmap->mddev->degraded) + return; +retry: + start = 0; + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; + restart = false; + + for (idx = 0; idx < llbitmap->nr_pages; idx++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (idx > 0) { + start = end + 1; + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); + } + + if (!test_bit(LLPageFlush, &pctl->flags) && + time_before(jiffies, pctl->expire)) { + restart = true; + continue; + } + + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", + mdname(llbitmap->mddev), __func__, idx); + continue; + } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); + llbitmap_resume(llbitmap, idx); + } + + /* + * If the daemon took a long time to finish, retry to avoid missing + * the clearing of dirty bits. + */ + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) + goto retry; + + /* If some page is dirty but not expired, set up the timer again */ + if (restart) + mod_timer(&llbitmap->pending_timer, + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); +} + +static int llbitmap_create(struct mddev *mddev) +{ + struct llbitmap *llbitmap; + int ret; + + ret = llbitmap_check_support(mddev); + if (ret) + return ret; + + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); + if (!llbitmap) + return -ENOMEM; + + llbitmap->mddev = mddev; + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; + + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); + atomic_set(&llbitmap->behind_writes, 0); + init_waitqueue_head(&llbitmap->behind_wait); + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = llbitmap; + ret = llbitmap_read_sb(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); + if (ret) { + kfree(llbitmap); + mddev->bitmap = NULL; + } + + return ret; +} + +static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long chunks; + + if (chunksize == 0) + chunksize = llbitmap->chunksize; + + /* If there is enough space, leave the chunksize unchanged.
*/ + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + + return 0; +} + +static int llbitmap_load(struct mddev *mddev) +{ + enum llbitmap_action action = BitmapActionReload; + struct llbitmap *llbitmap = mddev->bitmap; + + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) + action = BitmapActionStale; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); + return 0; +} + +static void llbitmap_destroy(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + mutex_lock(&mddev->bitmap_info.mutex); + + timer_delete_sync(&llbitmap->pending_timer); + flush_workqueue(md_llbitmap_io_wq); + flush_workqueue(md_llbitmap_unplug_wq); + + mddev->bitmap = NULL; + llbitmap_free_pages(llbitmap); + kfree(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static void llbitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_unplug_fn(struct work_struct *work) +{ + struct llbitmap_unplug_work *unplug_work = + container_of(work, struct llbitmap_unplug_work, work); + struct llbitmap *llbitmap = unplug_work->llbitmap; + struct blk_plug plug; + int i; + + 
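/* + * Write out every page that was marked LLPageDirty, wait for the + * resulting super writes to complete, then wake the llbitmap_unplug() + * caller sleeping on @done. + */ +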
blk_start_plug(&plug); + + for (i = 0; i < llbitmap->nr_pages; i++) { + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + continue; + + llbitmap_write_page(llbitmap, i); + } + + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); + complete(unplug_work->done); +} + +static bool llbitmap_dirty(struct llbitmap *llbitmap) +{ + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + return true; + + return false; +} + +static void llbitmap_unplug(struct mddev *mddev, bool sync) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct llbitmap *llbitmap = mddev->bitmap; + struct llbitmap_unplug_work unplug_work = { + .llbitmap = llbitmap, + .done = &done, + }; + + if (!llbitmap_dirty(llbitmap)) + return; + + /* + * Issuing new bitmap IO in submit_bio() context will deadlock: + * - the bio will wait for the bitmap bio to be done before it can be + * issued; + * - the bitmap bio will be added to current->bio_list and wait for + * this bio to be issued; + */ + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); + wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); +} + +/* + * Force writing all bitmap pages to disk; called when stopping the array, or + * every daemon_sleep seconds while sync_thread is running. + */ +static void __llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* mark all blocks as dirty */ + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + llbitmap_write_page(llbitmap, i); + } + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); + + timer_delete_sync(&llbitmap->pending_timer); + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); + flush_work(&llbitmap->daemon_work); + + __llbitmap_flush(mddev); +} + +/* This is used for raid5 lazy initial recovery */ +static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + return c == BitClean || c == BitDirty; +} + +static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + /* always skip unwritten blocks */ + if (c == BitUnwritten) + return blocks; + + /* For degraded array, don't skip */ + if (mddev->degraded) + return 0; + + /* For resync also skip clean/dirty blocks */ + if ((c == BitClean || c == BitDirty) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return blocks; + + return 0; +} + +static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + /* + * Handle one
bit at a time; this is much simpler, and it doesn't matter + * if md_do_sync() loops more times. + */ + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + return llbitmap_state_machine(llbitmap, p, p, + BitmapActionStartsync) == BitSyncing; +} + +/* Something went wrong, sync_thread stops at @offset */ +static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, + BitmapActionAbortsync); +} + +/* A full sync_thread has finished */ +static void llbitmap_close_sync(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* let daemon_fn clear dirty bits immediately */ + WRITE_ONCE(pctl->expire, jiffies); + } + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionEndsync); +} + +/* + * sync_thread has reached @sector; update metadata every daemon_sleep seconds, + * just in case sync_thread has to restart after a power failure. + */ +static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (sector == 0) { + llbitmap->last_end_sync = jiffies; + return; + } + + if (time_before(jiffies, llbitmap->last_end_sync + + HZ * mddev->bitmap_info.daemon_sleep)) + return; + + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, + BitmapActionEndsync); + __llbitmap_flush(mddev); + + llbitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(mddev->sysfs_completed); +} + +static bool llbitmap_enabled(void *data, bool flush) +{ + struct llbitmap *llbitmap = data; + + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); +} + +static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) +{ + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); +} + +static void llbitmap_write_sb(struct llbitmap *llbitmap) +{ + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); + + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); + llbitmap_write_page(llbitmap, 0); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_update_sb(void *data) +{ + struct llbitmap *llbitmap = data; + struct mddev *mddev = llbitmap->mddev; + struct page *sb_page; + bitmap_super_t *sb; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return; + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("%s: %s: read super block failed\n", __func__, + mdname(mddev)); + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); + return; + } + + if (mddev->events < llbitmap->events_cleared) + llbitmap->events_cleared = mddev->events; + + sb = kmap_local_page(sb_page); + sb->events = cpu_to_le64(mddev->events); + sb->state = cpu_to_le32(llbitmap->flags); + sb->chunksize = cpu_to_le32(llbitmap->chunksize); + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); + + kunmap_local(sb); +
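/* push the updated super block to disk through the dirty-block path */ +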
llbitmap_write_sb(llbitmap); +} + +static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct llbitmap *llbitmap = data; + + memset(stats, 0, sizeof(*stats)); + + stats->missing_pages = 0; + stats->pages = llbitmap->nr_pages; + stats->file_pages = llbitmap->nr_pages; + + stats->behind_writes = atomic_read(&llbitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); + stats->events_cleared = llbitmap->events_cleared; + + return 0; +} + +/* just flag all pages as needing to be written */ +static void llbitmap_write_all(struct mddev *mddev) +{ + int i; + struct llbitmap *llbitmap = mddev->bitmap; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + } +} + +static void llbitmap_start_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + atomic_inc(&llbitmap->behind_writes); +} + +static void llbitmap_end_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (atomic_dec_and_test(&llbitmap->behind_writes)) + wake_up(&llbitmap->behind_wait); +} + +static void llbitmap_wait_behind_writes(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + wait_event(llbitmap->behind_wait, + atomic_read(&llbitmap->behind_writes) == 0); + +} + +static ssize_t bits_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + int bits[BitStateCount] = {0}; + loff_t start = 0; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "bitmap io error\n"); + } + + while (start < llbitmap->chunks) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) + pr_err("%s: invalid bit %llu state %d\n", + __func__, start, c); + else + bits[c]++; + start++; + } + + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + bits[BitUnwritten], bits[BitClean], bits[BitDirty], + bits[BitNeedSync], bits[BitSyncing]); +} + +static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + ssize_t ret; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", + llbitmap->chunksize, llbitmap->chunkshift, + llbitmap->chunks, mddev->bitmap_info.offset, + llbitmap->mddev->bitmap_info.daemon_sleep); + mutex_unlock(&mddev->bitmap_info.mutex); + + return ret; +} + +static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); + +static ssize_t +daemon_sleep_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); +} + +static ssize_t +daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + mddev->bitmap_info.daemon_sleep = timeout; + return len; +} + +static struct md_sysfs_entry 
llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); + +static ssize_t +barrier_idle_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + return sprintf(page, "%lu\n", llbitmap->barrier_idle); +} + +static ssize_t +barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + llbitmap->barrier_idle = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); + +static struct attribute *md_llbitmap_attrs[] = { + &llbitmap_bits.attr, + &llbitmap_metadata.attr, + &llbitmap_daemon_sleep.attr, + &llbitmap_barrier_idle.attr, + NULL +}; + +static struct attribute_group md_llbitmap_group = { + .name = "llbitmap", + .attrs = md_llbitmap_attrs, +}; + +static struct bitmap_operations llbitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_LLBITMAP, + .name = "llbitmap", + }, + + .enabled = llbitmap_enabled, + .create = llbitmap_create, + .resize = llbitmap_resize, + .load = llbitmap_load, + .destroy = llbitmap_destroy, + + .start_write = llbitmap_start_write, + .end_write = llbitmap_end_write, + .start_discard = llbitmap_start_discard, + .end_discard = llbitmap_end_discard, + .unplug = llbitmap_unplug, + .flush = llbitmap_flush, + + .start_behind_write = llbitmap_start_behind_write, + .end_behind_write = llbitmap_end_behind_write, + .wait_behind_writes = llbitmap_wait_behind_writes, + + .blocks_synced = llbitmap_blocks_synced, + .skip_sync_blocks = llbitmap_skip_sync_blocks, + .start_sync = llbitmap_start_sync, + .end_sync = llbitmap_end_sync, + .close_sync = llbitmap_close_sync, + .cond_end_sync = llbitmap_cond_end_sync, + + .update_sb = llbitmap_update_sb, + .get_stats = llbitmap_get_stats, + .dirty_bits = llbitmap_dirty_bits, + .write_all = llbitmap_write_all, + + .group = &md_llbitmap_group, +}; + +int md_llbitmap_init(void) +{ + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_io_wq) + return -ENOMEM; + + md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_unplug_wq) { + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + return -ENOMEM; + } + + return register_md_submodule(&llbitmap_ops.head); +} + +void md_llbitmap_exit(void) +{ + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + destroy_workqueue(md_llbitmap_unplug_wq); + md_llbitmap_unplug_wq = NULL; + unregister_md_submodule(&llbitmap_ops.head); +} diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e033c26fdd4..41c476b40c7a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. 
*/ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -677,8 +676,64 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. + */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } + return true; + +err: + xa_unlock(&md_submodule); + return false; +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) @@ -713,7 +768,6 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -1020,15 +1074,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset within @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset within @page + * + * Write @size bytes of @page, starting from @offset, to @sector of @rdev. + * Increment mddev->pending_writes before returning, and decrement it on + * completion, waking up sb_wait. The caller must call md_super_wait() after + * issuing IO to all rdevs. If an error occurs, md_error() will be called, and + * @rdev will be kicked out of @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev * Increment mddev->pending_writes before returning * and decrement it on completion, waking up sb_wait * if zero is reached.
- * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -1046,7 +1111,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1356,6 +1421,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -1653,8 +1721,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -2302,8 +2370,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2313,13 +2381,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2331,8 +2401,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2804,24 +2873,24 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } @@ -4150,6 +4219,86 @@ static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, 
"none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4680,6 +4829,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if (err) return err; @@ -5752,6 +5904,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5801,7 +5954,6 @@ static const struct attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -6133,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; @@ -6299,7 +6471,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6372,7 +6544,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; put_pers(pers); - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6392,10 +6564,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6546,7 +6720,8 
@@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6573,7 +6748,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6589,7 +6765,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7307,6 +7483,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7363,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7679,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7710,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7747,9 +7926,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -8491,6 +8670,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8873,18 +9055,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_bitmap_start(struct mddev *mddev, struct md_io_clone *md_io_clone) { + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
+ mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + if (mddev->pers->bitmap_sector) mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) @@ -8893,7 +9081,7 @@ struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -8920,9 +9108,10 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); - if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { md_io_clone->offset = (*bio)->bi_iter.bi_sector; md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); md_bitmap_start(mddev, md_io_clone); } @@ -8944,7 +9133,7 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -9010,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev + * with the highest index to perform recovery to build initial xor data; this + * is the same as the old bitmap. + */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -9041,6 +9263,14 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares, and raid456 lazy initial recovery is + * requested. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery.
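The lazy-recovery selection above is compact but subtle, so here is a standalone model of it for review. This is an illustrative sketch only, not patch code: struct rdev and select_lazy_recover() are stand-ins for the kernel's md_rdev and mddev_select_lazy_recover_rdev(), under the all-members-in-sync precondition the comment states.

	#include <stdbool.h>
	#include <stdio.h>

	struct rdev { int raid_disk; bool faulty; bool in_sync; };

	/* Mirror of the selection loop: walk the members, bail out once a
	 * faulty or out-of-sync member is seen, and remember the highest
	 * raid_disk index seen so far. */
	static struct rdev *select_lazy_recover(struct rdev *rdevs, int n)
	{
		struct rdev *pick = NULL;

		for (int i = 0; i < n; i++) {
			struct rdev *r = &rdevs[i];

			if (r->raid_disk < 0)
				continue;
			if (r->faulty || !r->in_sync)
				break;
			if (!pick || pick->raid_disk < r->raid_disk)
				pick = r;
		}

		if (pick)
			pick->in_sync = false;	/* rebuild this member from sector 0 */
		return pick;
	}

	int main(void)
	{
		struct rdev d[] = {
			{ .raid_disk = 0, .in_sync = true },
			{ .raid_disk = 1, .in_sync = true },
			{ .raid_disk = 2, .in_sync = true },
		};
		struct rdev *p = select_lazy_recover(d, 3);

		printf("recover raid_disk %d\n", p ? p->raid_disk : -1);	/* 2 */
		return 0;
	}

With In_sync cleared on the chosen member, md_sync_position() above falls through with start = 0, and the raid456 personality only has to regenerate parity for the chunks the llbitmap left in BitNeedSync.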
@@ -9061,19 +9291,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) static bool sync_io_within_limit(struct mddev *mddev) { - int io_sectors; - /* * For raid456, sync IO is stripe(4k) per IO, for other levels, it's * RESYNC_PAGES(64k) per IO. */ - if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) - io_sectors = 8; - else - io_sectors = 128; - return atomic_read(&mddev->recovery_active) < - io_sectors * sync_io_depth(mddev); + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } #define SYNC_MARKS 10 @@ -9283,6 +9506,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9298,6 +9527,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ @@ -9607,6 +9837,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9615,6 +9846,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9624,7 +9856,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -9682,7 +9914,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 
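Both the skip hook and the MD_RECOVERY_LAZY_RECOVER handling above are driven by the per-chunk states kept by the llbitmap state machine earlier in this patch. The table can be exercised standalone for review; the sketch below is illustrative only, with demo-local enum values (0 plays the role of the patch's BitNone "no change" marker), and walks a chunk that was reloaded as needing sync through a resync and the daemon cleanup.

	#include <stdio.h>

	/* demo-local numbering; 0 stands in for the patch's BitNone */
	enum bit { None = 0, Unwritten, Clean, Dirty, NeedSync, Syncing, NrBits };
	enum act { Startwrite = 0, Startsync, Endsync, Daemon, NrActs };

	/* trimmed copy of state_machine[][]: unset entries mean "no change" */
	static const enum bit sm[NrBits][NrActs] = {
		[Unwritten] = { [Startwrite] = Dirty },
		[Clean]     = { [Startwrite] = Dirty },
		[Dirty]     = { [Daemon] = Clean },
		[NeedSync]  = { [Startsync] = Syncing },
		[Syncing]   = { [Startsync] = Syncing, [Endsync] = Dirty },
	};

	static const char * const name[] = {
		"none", "unwritten", "clean", "dirty", "needsync", "syncing",
	};

	int main(void)
	{
		enum bit c = NeedSync;	/* dirty chunk found after a crash */
		enum act seq[] = { Startsync, Endsync, Daemon };

		for (unsigned int i = 0; i < sizeof(seq) / sizeof(seq[0]); i++) {
			enum bit next = sm[c][seq[i]];

			if (next != None)
				c = next;
			printf("%s\n", name[c]);	/* syncing, dirty, clean */
		}
		return 0;
	}

Note that Endsync lands in Dirty rather than Clean: a resynced chunk is only relaxed to clean later by the daemon, mirroring the BitSyncing to BitDirty to BitClean path encoded in the table above.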
@@ -9770,7 +10002,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -9837,6 +10069,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9947,6 +10180,7 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, @@ -10094,8 +10328,16 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + + if (ret) + return ret; + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -10104,11 +10346,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -10127,12 +10364,13 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } @@ -10150,7 +10388,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } @@ -10438,8 +10676,8 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); diff --git a/drivers/md/md.h b/drivers/md/md.h index 51af29a03079..1979c2d4fe89 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -26,7 +26,7 @@ enum md_submodule_type { MD_PERSONALITY = 0, MD_CLUSTER, - MD_BITMAP, /* TODO */ + MD_BITMAP, }; enum md_submodule_id { @@ -38,8 +38,9 @@ enum md_submodule_id { ID_RAID6 = 6, ID_RAID10 = 10, ID_CLUSTER, - ID_BITMAP, /* TODO */ - ID_LLBITMAP, /* TODO */ + ID_BITMAP, + ID_LLBITMAP, + ID_BITMAP_NONE, }; struct md_submodule_head { @@ -565,6 +566,7 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ + enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { @@ -665,6 +667,8 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, + /* raid456 lazy initial recover */ + MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { @@ -796,7 +800,6 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern const struct 
attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { @@ -873,6 +876,7 @@ struct md_io_clone { unsigned long start_time; sector_t offset; unsigned long sectors; + enum stat_group rw; struct bio bio_clone; }; @@ -909,8 +913,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page); +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); @@ -1013,7 +1018,6 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; -extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); @@ -1034,6 +1038,12 @@ static inline bool mddev_is_dm(struct mddev *mddev) return !mddev->gendisk; } +static inline bool raid_is_456(struct mddev *mddev) +{ + return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || + mddev->level == ID_RAID6; +} + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 419139ad7663..e443e478645a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -464,21 +464,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, + zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + end = zone->zone_end; - } else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) @@ -613,17 +608,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - bio_chain(split, bio); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 52881e6032da..521625756128 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. 
*/ - if (!mddev->bitmap_ops->enabled(mddev)) { + if (!md_bitmap_enabled(mddev, true)) { raid1_submit_write(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d30b82beeb92..592a40233004 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* @@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1366,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags)) { + if (test_bit(WriteMostly, &mirror->rdev->flags) && + md_bitmap_enabled(mddev, false)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' @@ -1376,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1413,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1452,12 +1448,36 @@ retry: return true; } +static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio, + struct bio *bio) +{ + unsigned long max_write_behind = mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + + /* behind write rely on bitmap, see bitmap_operations */ + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + /* Don't do behind IO if reader is waiting, or there are too many. */ + if (!stats.behind_wait && stats.behind_writes < max_write_behind) + alloc_behind_master_bio(r1_bio, bio); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1561,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) @@ -1584,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1612,22 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { - unsigned long max_write_behind = - mddev->bitmap_info.max_write_behind; - struct md_bitmap_stats stats; - int err; - - /* do behind I/O ? - * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); - if (!err && write_behind && !stats.behind_wait && - stats.behind_writes < max_write_behind) - alloc_behind_master_bio(r1_bio, bio); - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) - mddev->bitmap_ops->start_behind_write(mddev); + if (write_behind) + raid1_start_write_behind(mddev, r1_bio, bio); first_clone = 0; } @@ -1683,8 +1684,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -2057,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); + md_bitmap_end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2804,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2829,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2846,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* we are incrementing sector_nr below. 
To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ - - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -3004,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3325,15 +3326,17 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); - int ret; if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, newsize, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index d236ef179cfb..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -178,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9832eefb2f15..14dcd5142eb4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* @@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. 
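For context, the raid0 and raid1 hunks above, and the raid10 hunks that follow, all apply the same conversion: the open-coded bio_split() + bio_chain() + submit_bio_noacct() sequence becomes a single bio_submit_split_bioset() call. Below is a minimal sketch of the resulting caller pattern, inferred from those call sites (the helper splits off the first max_sectors, submits the remainder, and returns NULL after ending the original bio with an error when the split fails); submit_clamped() and map_and_submit() are hypothetical names used only for illustration:

/*
 * Sketch: clamp a bio to max_sectors and submit it. On split failure,
 * bio_submit_split_bioset() has already completed the original bio,
 * so the caller simply stops; no error propagation is needed.
 */
static bool submit_clamped(struct mddev *mddev, struct bio *bio,
			   sector_t max_sectors)
{
	if (max_sectors < bio_sectors(bio)) {
		bio = bio_submit_split_bioset(bio, max_sectors,
					      &mddev->bio_set);
		if (!bio)
			return false;	/* bio already ended with an error */
	}
	map_and_submit(mddev, bio);	/* hypothetical per-level mapper */
	return true;
}

In the raid1/raid10 error paths, the NULL case additionally sets R1BIO_Returned or R10BIO_Returned before jumping to err_handle, which is why raid_end_bio_io() above now uses test_and_set_bit() on that flag: the master bio must never be completed a second time.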
@@ -1154,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1203,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1241,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1351,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && mddev->cluster_ops->area_resyncing(mddev, WRITE, @@ -1465,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. */ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) @@ -1489,17 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; } @@ -1531,8 +1523,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1679,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1694,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); @@ -3221,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - mddev->bitmap_ops->end_sync(mddev, - mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - 
mddev->bitmap_ops->end_sync(mddev, sect, - &sync_blocks); + md_bitmap_end_sync(mddev, sect, &sync_blocks); } } else { /* completed sync */ @@ -3249,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3351,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, - true); + must_sync = md_bitmap_start_sync(mddev, sect, + &sync_blocks, true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3396,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, still_degraded); - + md_bitmap_start_sync(mddev, sect, &sync_blocks, + still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { int k; @@ -3570,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, - mddev->degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4226,7 +4217,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; - int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4240,9 +4230,12 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, size, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && @@ -4508,8 +4501,9 @@ static int raid10_start_reshape(struct mddev *mddev) oldsize = raid10_size(mddev, 0, 0); newsize = raid10_size(mddev, 0, conf->geo.raid_disks); - if (!mddev_is_clustered(mddev)) { - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (!mddev_is_clustered(mddev) && + md_bitmap_enabled(mddev, false)) { + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; else @@ -4531,13 +4525,14 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + /* cluster can't be setup without bitmap */ + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - mddev->bitmap_ops->resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0); goto abort; } } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3f16ad6904a9..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ 
b/drivers/md/raid10.h @@ -165,6 +165,8 @@ enum r10bio_state { * so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e385ef1355e8..24b32a0c95b4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t resync_offset = conf->mddev->resync_offset; + struct mddev *mddev = conf->mddev; + sector_t resync_offset = mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4116,6 +4117,12 @@ static int handle_stripe_dirtying(struct r5conf *conf, pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); + } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { + /* The initial recover is not done, must read everything */ + rcw = 1; rmw = 2; + pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", + sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", sh->sector, rmw); for (i = disks; i--; ) { @@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && !mddev_is_dm(conf->mddev)) - blk_add_trace_msg(conf->mddev->gendisk->queue, + if (rcw && !mddev_is_dm(mddev)) + blk_add_trace_msg(mddev->gendisk->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -4698,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= + rdev->recovery_offset) { + /* + * in sync if: + * - normal IO, or + * - resync IO that is not lazy recovery + * + * For lazy recovery, we have to mark the rdev without + * In_sync as failed, to build initial xor data. + */ + if (!test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(MD_RECOVERY_LAZY_RECOVER, + &conf->mddev->recovery)) + set_bit(R5_Insync, &dev->flags); + } else if (test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Expanded, &dev->flags)) /* If we've reshaped into here, we assume it is Insync. 
* We will shortly update recovery_offset to make @@ -5468,17 +5486,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - submit_bio_noacct(raid_bio); - raid_bio = split; + + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) @@ -6492,11 +6510,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); return 0; } @@ -6525,8 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - true) && + !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6535,7 +6553,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6557,9 +6576,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n still_degraded = true; } - mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - still_degraded); - + md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6763,7 +6780,8 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - mddev->bitmap_ops->unplug(mddev, true); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -8313,7 +8331,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; - int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8323,9 +8340,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && diff --git 
a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index d34892782f6e..1af157ce0a63 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb) msb->card = NULL; } -static int msb_bd_getgeo(struct block_device *bdev, +static int msb_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct msb_data *msb = bdev->bd_disk->private_data; + struct msb_data *msb = disk->private_data; *geo = msb->geometry; return 0; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d28..075519caa547 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk) kfree(msb); } -static int mspro_block_bd_getgeo(struct block_device *bdev, +static int mspro_block_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mspro_block_data *msb = bdev->bd_disk->private_data; + struct mspro_block_data *msb = disk->private_data; geo->heads = msb->heads; geo->sectors = msb->sectors_per_track; diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index 3a64dc7a7e27..3304f8824cf7 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, * This is anyones guess quite frankly. */ int -mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, +mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h index 8c2bb2331fc1..f9678d48100c 100644 --- a/drivers/message/fusion/mptscsih.h +++ b/drivers/message/fusion/mptscsih.h @@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt); extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt); -extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]); +extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]); extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index a74e75df93b0..9399bf6c766a 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -439,9 +439,9 @@ static void mmc_blk_release(struct gendisk *disk) } static int -mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16); + geo->cylinders = get_capacity(disk) / (4 * 16); geo->heads = 4; geo->sectors = 16; return 0; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 847c11542f02..28e09d080440 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -246,9 +246,9 @@ unlock: blktrans_dev_put(dev); } -static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int blktrans_getgeo(struct gendisk *disk, struct 
hd_geometry *geo) { - struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + struct mtd_blktrans_dev *dev = disk->private_data; int ret = -ENXIO; mutex_lock(&dev->lock); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 39cc0a6a4d37..b53fd147fa65 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd) mutex_unlock(&dev->dev_mutex); } -static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* Some tools might require this information */ geo->heads = 1; geo->cylinders = 1; - geo->sectors = get_capacity(bdev->bd_disk); + geo->sectors = get_capacity(disk); geo->start = 0; return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 2a1aa32e6693..a933db961ed7 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 91e273b89fea..1f51fbebd9fa 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -684,6 +684,59 @@ out_free_enc: EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); /** + * hkdf_expand_label - HKDF-Expand-Label (RFC 8446 section 7.1) + * @hmac_tfm: hash context keyed with pseudorandom key + * @label: ASCII label without "tls13 " prefix + * @labellen: length of @label + * @context: context bytes + * @contextlen: length of @context + * @okm: output keying material + * @okmlen: length of @okm + * + * Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand(). + * + * Returns 0 on success with output keying material stored in @okm, + * or a negative errno value otherwise.
+ */ +static int hkdf_expand_label(struct crypto_shash *hmac_tfm, + const u8 *label, unsigned int labellen, + const u8 *context, unsigned int contextlen, + u8 *okm, unsigned int okmlen) +{ + int err; + u8 *info; + unsigned int infolen; + const char *tls13_prefix = "tls13 "; + unsigned int prefixlen = strlen(tls13_prefix); + + if (WARN_ON(labellen > (255 - prefixlen))) + return -EINVAL; + if (WARN_ON(contextlen > 255)) + return -EINVAL; + + infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen); + info = kzalloc(infolen, GFP_KERNEL); + if (!info) + return -ENOMEM; + + /* HkdfLabel.Length */ + put_unaligned_be16(okmlen, info); + + /* HkdfLabel.Label */ + info[2] = prefixlen + labellen; + memcpy(info + 3, tls13_prefix, prefixlen); + memcpy(info + 3 + prefixlen, label, labellen); + + /* HkdfLabel.Context */ + info[3 + prefixlen + labellen] = contextlen; + memcpy(info + 4 + prefixlen + labellen, context, contextlen); + + err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen); + kfree_sensitive(info); + return err; +} + +/** * nvme_auth_derive_tls_psk - Derive TLS PSK * @hmac_id: Hash function identifier * @psk: generated input PSK @@ -715,10 +768,10 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, { struct crypto_shash *hmac_tfm; const char *hmac_name; - const char *psk_prefix = "tls13 nvme-tls-psk"; + const char *label = "nvme-tls-psk"; static const char default_salt[HKDF_MAX_HASHLEN]; - size_t info_len, prk_len; - char *info; + size_t prk_len; + const char *ctx; unsigned char *prk, *tls_key; int ret; @@ -758,36 +811,29 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, if (ret) goto out_free_prk; - /* - * 2 additional bytes for the length field from HDKF-Expand-Label, - * 2 additional bytes for the HMAC ID, and one byte for the space - * separator. 
- */ - info_len = strlen(psk_digest) + strlen(psk_prefix) + 5; - info = kzalloc(info_len + 1, GFP_KERNEL); - if (!info) { + ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest); + if (!ctx) { ret = -ENOMEM; goto out_free_prk; } - put_unaligned_be16(psk_len, info); - memcpy(info + 2, psk_prefix, strlen(psk_prefix)); - sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest); - tls_key = kzalloc(psk_len, GFP_KERNEL); if (!tls_key) { ret = -ENOMEM; - goto out_free_info; + goto out_free_ctx; } - ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len); + ret = hkdf_expand_label(hmac_tfm, + label, strlen(label), + ctx, strlen(ctx), + tls_key, psk_len); if (ret) { kfree(tls_key); - goto out_free_info; + goto out_free_ctx; } *ret_psk = tls_key; -out_free_info: - kfree(info); +out_free_ctx: + kfree(ctx); out_free_prk: kfree(prk); out_free_shash: diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 201fc8809a62..012fcfc79a73 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -331,9 +331,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, } else { memset(chap->c2, 0, chap->hash_len); } - if (ctrl->opts->concat) + if (ctrl->opts->concat) { chap->s2 = 0; - else + chap->bi_directional = false; + } else chap->s2 = nvme_auth_get_seqnum(); data->seqnum = cpu_to_le32(chap->s2); if (chap->host_key_len) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6b7493934535..fa4181d7de73 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1807,12 +1807,12 @@ static void nvme_release(struct gendisk *disk) nvme_ns_release(disk->private_data); } -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } @@ -3167,6 +3167,11 @@ static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl) return ctrl->cntrltype == NVME_CTRL_ADMIN; } +static inline bool nvme_is_io_ctrl(struct nvme_ctrl *ctrl) +{ + return !nvme_discovery_ctrl(ctrl) && !nvme_admin_ctrl(ctrl); +} + static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { @@ -3369,7 +3374,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) else ctrl->max_zeroes_sectors = 0; - if (ctrl->subsys->subtype != NVME_NQN_NVME || + if (!nvme_is_io_ctrl(ctrl) || !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) || test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags)) return 0; @@ -3491,14 +3496,14 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct return -EINVAL; } - if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) { + if (nvme_is_io_ctrl(ctrl) && ctrl->ioccsz < 4) { dev_err(ctrl->device, "I/O queue command capsule supported size %d < 4\n", ctrl->ioccsz); return -EINVAL; } - if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) { + if (nvme_is_io_ctrl(ctrl) && ctrl->iorcsz < 1) { dev_err(ctrl->device, "I/O queue response capsule supported size %d < 1\n", ctrl->iorcsz); @@ -4990,8 +4995,14 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) * checking that they started once before, hence are reconnecting back. 
*/ if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) && - nvme_discovery_ctrl(ctrl)) + nvme_discovery_ctrl(ctrl)) { + if (!ctrl->kato) { + nvme_stop_keep_alive(ctrl); + ctrl->kato = NVME_DEFAULT_KATO; + nvme_start_keep_alive(ctrl); + } nvme_change_uevent(ctrl, "NVME_EVENT=rediscover"); + } if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3e12d4683ac7..03987f497a5b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3032,11 +3032,17 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ++ctrl->ctrl.nr_reconnects; - if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + spin_lock_irqsave(&ctrl->rport->lock, flags); + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&ctrl->rport->lock, flags); return -ENODEV; + } - if (nvme_fc_ctlr_active_on_rport(ctrl)) + if (nvme_fc_ctlr_active_on_rport(ctrl)) { + spin_unlock_irqrestore(&ctrl->rport->lock, flags); return -ENOTUNIQ; + } + spin_unlock_irqrestore(&ctrl->rport->lock, flags); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: create association : host wwpn 0x%016llx " diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e28bb9113f64..c212fa952c0f 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -142,14 +142,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 0, rq_data_dir(req)); - if (ret) return ret; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (has_metadata) { ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cfd2b5b90b91..102fae6a231c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_id_ns **id); -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c6d9506b172..c916176bd9f0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -172,9 +172,7 @@ struct nvme_dev { u32 last_ps; bool hmb; struct sg_table *hmb_sgt; - mempool_t *dmavec_mempool; - mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -261,6 +259,9 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + + /* Metadata using non-coalesced MPTR */ + IOD_SINGLE_META_SEGMENT = 1U << 5, }; struct nvme_dma_vec { @@ -284,7 +285,8 @@ struct nvme_iod { unsigned int nr_dma_vecs; dma_addr_t meta_dma; - struct sg_table meta_sgt; + unsigned int meta_total_len; + struct dma_iova_state meta_dma_state; struct nvme_sgl_desc *meta_descriptor; }; @@ -641,6 +643,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } +static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd) +{ + return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG; +} + static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) { return cmd->common.flags & @@ -690,25 
+697,52 @@ static void nvme_free_prps(struct request *req) mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); } -static void nvme_free_sgls(struct request *req) +static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, + struct nvme_sgl_desc *sg_list) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum dma_data_direction dir = rq_dma_dir(req); + unsigned int len = le32_to_cpu(sge->length); struct device *dma_dev = nvmeq->dev->dev; - dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); - unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); - struct nvme_sgl_desc *sg_list = iod->descriptors[0]; + unsigned int i; + + if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) { + dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir); + return; + } + + for (i = 0; i < len / sizeof(*sg_list); i++) + dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir); +} + +static void nvme_unmap_metadata(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; enum dma_data_direction dir = rq_dma_dir(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct device *dma_dev = nvmeq->dev->dev; + struct nvme_sgl_desc *sge = iod->meta_descriptor; - if (iod->nr_descriptors) { - unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; + if (iod->flags & IOD_SINGLE_META_SEGMENT) { + dma_unmap_page(dma_dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, + rq_dma_dir(req)); + return; + } - for (i = 0; i < nr_entries; i++) - dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), - le32_to_cpu(sg_list[i].length), dir); - } else { - dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir); + if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len)) { + if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) + nvme_free_sgls(req, sge, &sge[1]); + else + dma_unmap_page(dma_dev, iod->meta_dma, + iod->meta_total_len, dir); } + + if (iod->meta_descriptor) + dma_pool_free(nvmeq->descriptor_pools.small, + iod->meta_descriptor, iod->meta_dma); } static void nvme_unmap_data(struct request *req) @@ -727,7 +761,8 @@ static void nvme_unmap_data(struct request *req) if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) - nvme_free_sgls(req); + nvme_free_sgls(req, iod->descriptors[0], + &iod->cmd.common.dptr.sgl); else nvme_free_prps(req); } @@ -1007,70 +1042,70 @@ static blk_status_t nvme_map_data(struct request *req) return nvme_pci_setup_data_prp(req, &iter); } -static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, - struct scatterlist *sg) -{ - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); - sge->type = NVME_SGL_FMT_DATA_DESC << 4; -} - static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; + unsigned int entries = req->nr_integrity_segments; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = nvmeq->dev; struct nvme_sgl_desc *sg_list; - struct scatterlist *sgl, *sg; - unsigned int entries; + struct blk_dma_iter iter; dma_addr_t sgl_dma; - int rc, i; + int i = 0; - iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); - if (!iod->meta_sgt.sgl) - return BLK_STS_RESOURCE; + if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev, + &iod->meta_dma_state, &iter)) + return iter.status; - 
sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); - iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, - iod->meta_sgt.sgl); - if (!iod->meta_sgt.orig_nents) - goto out_free_sg; + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + entries = 1; - rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) - goto out_free_sg; + /* + * The NVMe MPTR descriptor has an implicit length that the host and + * device must agree on to avoid data/memory corruption. We trust the + * kernel allocated correctly based on the format's parameters, so use + * the more efficient MPTR to avoid extra dma pool allocations for the + * SGL indirection. + * + * But for user commands, we don't necessarily know what they do, so + * the driver can't validate the metadata buffer size. The SGL + * descriptor provides an explicit length, so we're relying on that + * mechanism to catch any misunderstandings between the application and + * device. + */ + if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) { + iod->cmd.common.metadata = cpu_to_le64(iter.addr); + iod->meta_total_len = iter.len; + iod->meta_dma = iter.addr; + iod->meta_descriptor = NULL; + return BLK_STS_OK; + } sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, &sgl_dma); if (!sg_list) - goto out_unmap_sg; + return BLK_STS_RESOURCE; - entries = iod->meta_sgt.nents; iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - iod->cmd.common.flags = NVME_CMD_SGL_METASEG; iod->cmd.common.metadata = cpu_to_le64(sgl_dma); - - sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data_sg(sg_list, sgl); + iod->meta_total_len = iter.len; + nvme_pci_sgl_set_data(sg_list, &iter); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); - nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); - for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); - - return BLK_STS_OK; + do { + nvme_pci_sgl_set_data(&sg_list[++i], &iter); + iod->meta_total_len += iter.len; + } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter)); -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); - return BLK_STS_RESOURCE; + nvme_pci_sgl_set_seg(sg_list, sgl_dma, i); + if (unlikely(iter.status)) + nvme_unmap_metadata(req); + return iter.status; } static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) @@ -1083,6 +1118,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); + iod->flags |= IOD_SINGLE_META_SEGMENT; return BLK_STS_OK; } @@ -1104,7 +1140,7 @@ static blk_status_t nvme_prep_rq(struct request *req) iod->flags = 0; iod->nr_descriptors = 0; iod->total_len = 0; - iod->meta_sgt.nents = 0; + iod->meta_total_len = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) @@ -1215,25 +1251,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } -static __always_inline void nvme_unmap_metadata(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - - if (!iod->meta_sgt.nents) { - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req).bv_len, - rq_dma_dir(req)); - return; - } - - dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, - iod->meta_dma); - 
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); -} - static __always_inline void nvme_pci_unmap_rq(struct request *req) { if (blk_integrity_rq(req)) @@ -3039,7 +3056,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; dev->dmavec_mempool = mempool_create_node(1, @@ -3048,17 +3064,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->dmavec_mempool) return -ENOMEM; - - dev->iod_meta_mempool = mempool_create_node(1, - mempool_kmalloc, mempool_kfree, - (void *)meta_size, GFP_KERNEL, - dev_to_node(dev->dev)); - if (!dev->iod_meta_mempool) - goto free; return 0; -free: - mempool_destroy(dev->dmavec_mempool); - return -ENOMEM; } static void nvme_free_tagset(struct nvme_dev *dev) @@ -3324,10 +3330,12 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND * because of high power consumption (> 2 Watt) in s2idle * sleep. Only some boards with Intel CPU are affected. + * (Note for testing: Samsung 990 Evo Plus has same PCI ID) */ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || dmi_match(DMI_BOARD_NAME, "GMxPXxx") || dmi_match(DMI_BOARD_NAME, "GXxMRXx") || + dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || dmi_match(DMI_BOARD_NAME, "PH4PG31") || dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) @@ -3508,7 +3516,6 @@ out_disable: nvme_free_queues(dev, 0); out_release_iod_mempool: mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3572,7 +3579,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c0fe8cfb7229..1413788ca7d5 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2250,6 +2250,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_tagset; + if (ctrl->opts->concat && !ctrl->tls_pskid) + return 0; + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 0dd7bd99afa3..5d7d483bfbe3 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -513,9 +513,6 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns) return 0; } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, struct nvmet_ns *ns) { @@ -523,6 +520,8 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, struct pci_dev *p2p_dev; int ret; + lockdep_assert_held(&ctrl->subsys->lock); + if (!ctrl->p2p_client || !ns->use_p2pmem) return; @@ -1539,15 +1538,14 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) return false; } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, struct device *p2p_client) { struct nvmet_ns *ns; unsigned long 
idx; + lockdep_assert_held(&ctrl->subsys->lock); + if (!p2p_client) return; @@ -1557,14 +1555,13 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, nvmet_p2pmem_ns_add_p2p(ctrl, ns); } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) { struct radix_tree_iter iter; void __rcu **slot; + lockdep_assert_held(&ctrl->subsys->lock); + radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0) pci_dev_put(radix_tree_deref_slot(slot)); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a9b18c051f5b..7d84527d5a43 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -54,6 +54,8 @@ struct nvmet_fc_ls_req_op { /* for an LS RQST XMT */ int ls_error; struct list_head lsreq_list; /* tgtport->ls_req_list */ bool req_queued; + + struct work_struct put_work; }; @@ -111,8 +113,6 @@ struct nvmet_fc_tgtport { struct nvmet_fc_port_entry *pe; struct kref ref; u32 max_sg_cnt; - - struct work_struct put_work; }; struct nvmet_fc_port_entry { @@ -235,12 +235,13 @@ static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue); static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); -static void nvmet_fc_put_tgtport_work(struct work_struct *work) +static void nvmet_fc_put_lsop_work(struct work_struct *work) { - struct nvmet_fc_tgtport *tgtport = - container_of(work, struct nvmet_fc_tgtport, put_work); + struct nvmet_fc_ls_req_op *lsop = + container_of(work, struct nvmet_fc_ls_req_op, put_work); - nvmet_fc_tgtport_put(tgtport); + nvmet_fc_tgtport_put(lsop->tgtport); + kfree(lsop); } static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, @@ -367,7 +368,7 @@ __nvmet_fc_finish_ls_req(struct nvmet_fc_ls_req_op *lsop) DMA_BIDIRECTIONAL); out_putwork: - queue_work(nvmet_wq, &tgtport->put_work); + queue_work(nvmet_wq, &lsop->put_work); } static int @@ -388,6 +389,7 @@ __nvmet_fc_send_ls_req(struct nvmet_fc_tgtport *tgtport, lsreq->done = done; lsop->req_queued = false; INIT_LIST_HEAD(&lsop->lsreq_list); + INIT_WORK(&lsop->put_work, nvmet_fc_put_lsop_work); lsreq->rqstdma = fc_dma_map_single(tgtport->dev, lsreq->rqstaddr, lsreq->rqstlen + lsreq->rsplen, @@ -447,8 +449,6 @@ nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) __nvmet_fc_finish_ls_req(lsop); /* fc-nvme target doesn't care about success or failure of cmd */ - - kfree(lsop); } /* @@ -1075,6 +1075,14 @@ nvmet_fc_delete_assoc_work(struct work_struct *work) static void nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) { + int terminating; + + terminating = atomic_xchg(&assoc->terminating, 1); + + /* if already terminating, do nothing */ + if (terminating) + return; + nvmet_fc_tgtport_get(assoc->tgtport); if (!queue_work(nvmet_wq, &assoc->del_work)) nvmet_fc_tgtport_put(assoc->tgtport); @@ -1202,13 +1210,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) { struct nvmet_fc_tgtport *tgtport = assoc->tgtport; unsigned long flags; - int i, terminating; - - terminating = atomic_xchg(&assoc->terminating, 1); - - /* if already terminating, do nothing */ - if (terminating) - return; + int i; spin_lock_irqsave(&tgtport->lock, flags); list_del_rcu(&assoc->a_list); @@ -1410,7 +1412,6 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, 
kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); newrec->max_sg_cnt = template->max_sgl_segments; - INIT_WORK(&newrec->put_work, nvmet_fc_put_tgtport_work); ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 257b497d515a..5dffcc5becae 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -496,13 +496,15 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, if (!targetport) { /* * The target port is gone. The target doesn't expect any - * response anymore and the ->done call is not valid - * because the resources have been freed by - * nvmet_fc_free_pending_reqs. + * response anymore and thus lsreq can't be accessed anymore. * * We end up here from delete association exchange: * nvmet_fc_xmt_disconnect_assoc sends an async request. + * + * Return success because this is what LLDDs do; silently + * drop the response. */ + lsrsp->done(lsrsp); kmem_cache_free(lsreq_cache, tls_req); return 0; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 506a947d00a5..7765e40f7cea 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -334,6 +334,11 @@ static int dasd_state_basic_to_ready(struct dasd_device *device) lim.max_dev_sectors = device->discipline->max_sectors(block); lim.max_hw_sectors = lim.max_dev_sectors; lim.logical_block_size = block->bp_block; + /* + * Adjust dma_alignment to match block_size - 1 + * to ensure proper buffer alignment checks in the block layer. + */ + lim.dma_alignment = lim.logical_block_size - 1; if (device->discipline->has_discard) { unsigned int max_bytes; @@ -3114,12 +3119,14 @@ static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, PTR_ERR(cqr) == -ENOMEM || PTR_ERR(cqr) == -EAGAIN) { rc = BLK_STS_RESOURCE; - goto out; + } else if (PTR_ERR(cqr) == -EINVAL) { + rc = BLK_STS_INVAL; + } else { + DBF_DEV_EVENT(DBF_ERR, basedev, + "CCW creation failed (rc=%ld) on request %p", + PTR_ERR(cqr), req); + rc = BLK_STS_IOERR; } - DBF_DEV_EVENT(DBF_ERR, basedev, - "CCW creation failed (rc=%ld) on request %p", - PTR_ERR(cqr), req); - rc = BLK_STS_IOERR; goto out; } /* @@ -3317,11 +3324,11 @@ static void dasd_release(struct gendisk *disk) /* * Return disk geometry. 
*/ -static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct dasd_device *base; - base = dasd_device_from_gendisk(bdev->bd_disk); + base = dasd_device_from_gendisk(disk); if (!base) return -ENODEV; @@ -3331,7 +3338,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EINVAL; } base->discipline->fill_geometry(base->block, geo); - geo->start = get_start_sect(bdev) >> base->block->s2b_shift; + // geo->start is left unchanged by the above + geo->start >>= base->block->s2b_shift; dasd_put_device(base); return 0; } diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 883d4a12a172..a377a6f6900a 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1695,7 +1695,7 @@ out: } /* End twa_reset_sequence() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 8d4174c7107e..e319be7d369c 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1404,7 +1404,7 @@ out: } /* End twl_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 89bd56f78ef9..0306a228c702 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev) } /* End tw_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 82597bd96525..a86d780d1ba4 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3240,7 +3240,7 @@ static int blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset) the BIOS, and a warning may be displayed. 
*/ -static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, +static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *params) { struct blogic_adapter *adapter = @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev); + buf = scsi_bios_ptable(disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h index 61bf26d4fc10..79de815e33b0 100644 --- a/drivers/scsi/BusLogic.h +++ b/drivers/scsi/BusLogic.h @@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets, static const char *blogic_drvr_info(struct Scsi_Host *); static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *); -static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *); +static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *); static int blogic_sdev_configure(struct scsi_device *, struct queue_limits *lim); static void blogic_qcompleted_ccb(struct blogic_ccb *); diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 4b12e6dd8f07..ea66196ef7c7 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) /** * aac_biosparm - return BIOS parameters for disk * @sdev: The scsi device corresponding to the disk - * @bdev: the block device corresponding to the disk + * @disk: the gendisk corresponding to the disk * @capacity: the sector capacity of the disk * @geom: geometry block to fill in * @@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) * be displayed. */ -static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, +static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *geom) { struct diskparm *param = (struct diskparm *)geom; @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). 
*/ - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index 3a2c336307c0..063e1b5818d3 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp) * ip[2]: cylinders */ static int -advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev, +advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { struct asc_board *boardp = shost_priv(sdev->host); diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index e94c0a19c435..182aa80ec4c6 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt) * Return the "logical geometry" * */ -static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *info_array) { struct Scsi_Host *shpnt = sdev->host; @@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev int info[3]; /* try to figure out the geometry from the partition table */ - if (scsicam_bios_param(bdev, capacity, info) < 0 || + if (scsicam_bios_param(disk, capacity, info) < 0 || !((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) { if (EXT_TRANS) { printk(KERN_NOTICE diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index 389499d3e00a..371e8300f029 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd) } static int aha1542_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { struct aha1542_hostdata *aha1542 = shost_priv(sdev->host); diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be7ebbbb9ba8..b234621f6b37 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level, } static int aha1740_biosparam(struct scsi_device *sdev, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int* ip) { int size = capacity; diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 17dfc3c72110..c3d1b9dd24ae 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. 
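scsi_bios_ptable() and scsi_partsize() now key off the gendisk as well, so the usual bios_param structure is unchanged: consult the partition table first, then fall back to a synthetic BIOS translation. A hedged sketch of that common pattern (foo_biosparam is hypothetical; the 64/32 fallback mirrors several of the drivers converted below):

static int foo_biosparam(struct scsi_device *sdev, struct gendisk *disk,
			 sector_t capacity, int geom[])
{
	/* Geometry recovered from the on-disk partition table wins. */
	if (scsi_partsize(disk, capacity, geom))
		return 0;

	/* Otherwise synthesize a translation the BIOS could have used. */
	geom[0] = 64;			/* heads */
	geom[1] = 32;			/* sectors per track */
	sector_div(capacity, 64 * 32);
	geom[2] = capacity;		/* cylinders */
	return 0;
}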
*/ static int -ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index cebf8c5d0caf..8b2b98666d61 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. */ static int -ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index fb57343a97bd..f0c5a30ce51b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb); static int arcmsr_abort(struct scsi_cmnd *); static int arcmsr_bus_reset(struct scsi_cmnd *); static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *info); + struct gendisk *disk, sector_t capacity, int *info); static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd); static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id); @@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id) } static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *geom) + struct gendisk *disk, sector_t capacity, int *geom) { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 401242912855..df6f40b51deb 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr) } -static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev, +static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int *ip) { int heads, sectors, cylinders; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 504c4e0c5d17..c0b2a980db34 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd) } static int fdomain_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, + struct gendisk *disk, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev); + unsigned char *p = scsi_bios_ptable(disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 
0821cf994b98..5c602c057798 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 8648bd965287..ed34ad92c807 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) /** * i91u_biosparam - return the "logical geometry * @sdev: SCSI device - * @dev: Matching block device + * @unused: Matching gendisk * @capacity: Sector size of drive * @info_array: Return space for BIOS geometry * @@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) * FIXME: limited to 2^32 sector devices. */ -static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info_array) { struct initio_host *host; /* Point to Host adapter control block */ diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d06b79f03538..dd6754db7e4c 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev); /** * ipr_biosparam - Return the HSC mapping - * @sdev: scsi device struct - * @block_device: block device pointer + * @sdev: scsi device struct + * @unused: gendisk pointer * @capacity: capacity of the device - * @parm: Array containing returned HSC values. + * @parm: Array containing returned HSC values. * * This function generates the HSC parms that fdisk uses. 
* We want to make sure we return something that places partitions @@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev); * 0 on success **/ static int ipr_biosparam(struct scsi_device *sdev, - struct block_device *block_device, + struct gendisk *unused, sector_t capacity, int *parm) { int heads, sectors; diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 94adb6ac02a4..3393a288fd23 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue) /* Set bios geometry for the controller */ /* */ /****************************************************************************/ -static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 8ac932ec4444..30a4d4a580e9 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -398,7 +398,7 @@ /* * Scsi_Host Template */ - static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, + static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); static int ips_sdev_configure(struct scsi_device *SDptr, struct queue_limits *lim); diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 928723c90b75..ffa5b49aaf08 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth) EXPORT_SYMBOL_GPL(sas_change_queue_depth); int sas_bios_param(struct scsi_device *scsi_dev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int *hsc) { hsc[0] = 255; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 2006094af418..a00622c0c526 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare * Return the disk geometry for a particular disk */ static int -megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, +megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { adapter_t *adapter; @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index 013fbfb911b9..d6bfd26a8843 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *); static int megaraid_abort(struct scsi_cmnd *); static int megaraid_reset(struct scsi_cmnd *); static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int); -static int megaraid_biosparam(struct scsi_device *, struct block_device *, +static int megaraid_biosparam(struct scsi_device *, struct gendisk *, sector_t, int []); static int mega_build_sglist (adapter_t *adapter, scb_t *scb, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 615e06fd4ee8..abbbc4b36cd1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd) /** * megasas_bios_param 
- Returns disk geometry for a disk * @sdev: device handle - * @bdev: block device + * @unused: gendisk * @capacity: drive capacity * @geom: geometry parameters */ static int -megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, +megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index e467b56949e9..3df52a3b435b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -4031,7 +4031,7 @@ out: /** * mpi3mr_bios_param - BIOS param callback * @sdev: SCSI device reference - * @bdev: Block device reference + * @unused: gendisk reference * @capacity: Capacity in logical sectors * @params: Parameter array * @@ -4040,7 +4040,7 @@ out: * Return: 0 always */ static int mpi3mr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int params[]) + struct gendisk *unused, sector_t capacity, int params[]) { int heads; int sectors; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 967af259118e..7092d0debef3 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) /** * scsih_bios_param - fetch head, sector, cylinder info for a disk * @sdev: scsi device struct - * @bdev: pointer to block device context + * @unused: pointer to gendisk * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * params[2] number of cylinders */ static int -scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev, +scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int params[]) { int heads; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index 96549e7f5705..bdc2f2f17753 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd) } static int -mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev, +mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index 486db5b2f05d..b8453c0333dc 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev) kfree(sdev->hostdata); } -static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { struct myrb_hba *cb = shost_priv(sdev->host); diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 278c78d066c4..a3b505240351 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c @@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt) static int SYM53C500_biosparm(struct scsi_device *disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int *info_array) { int size; diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c index 1ed3171f1797..ea682f3044b6 100644 --- a/drivers/scsi/ppa.c +++ b/drivers/scsi/ppa.c @@ -845,7 +845,7 @@ static 
DEF_SCSI_QCMD(ppa_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 6af018f1ca22..ef841f643171 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd) } static int -qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev, +qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c index 3e065d5fc80c..1ce469b7db99 100644 --- a/drivers/scsi/qlogicfas408.c +++ b/drivers/scsi/qlogicfas408.c @@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand) * Return bios parameters */ -int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev, +int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int ip[]) { /* This should mimic the DOS Qlogic driver's behavior exactly */ diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h index a971db11d293..83ef86c71f2f 100644 --- a/drivers/scsi/qlogicfas408.h +++ b/drivers/scsi/qlogicfas408.h @@ -106,7 +106,7 @@ struct qlogicfas408_priv { irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id); int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd); int qlogicfas408_biosparam(struct scsi_device * disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int ip[]); int qlogicfas408_abort(struct scsi_cmnd * cmd); extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd); diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 19e6c3852d50..887de505bcf9 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -30,9 +30,9 @@ * starting at offset %0x1be. * Returns: partition table in kmalloc(GFP_KERNEL) memory, or NULL on error. */ -unsigned char *scsi_bios_ptable(struct block_device *dev) +unsigned char *scsi_bios_ptable(struct gendisk *dev) { - struct address_space *mapping = bdev_whole(dev)->bd_mapping; + struct address_space *mapping = dev->part0->bd_mapping; unsigned char *res = NULL; struct folio *folio; @@ -48,7 +48,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); /** * scsi_partsize - Parse cylinders/heads/sectors from PC partition table - * @bdev: block device to parse + * @disk: gendisk of the disk to parse * @capacity: size of the disk in sectors * @geom: output in form of [hds, cylinders, sectors] * @@ -57,7 +57,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); * * Returns: %false on failure, %true on success. */ -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]) { int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; @@ -65,7 +65,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) void *buf; int ret = false; - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(disk); if (!buf) return false; @@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds /** * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. 
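None of this is user-visible: sd_getgeo() below keeps serving the HDIO_GETGEO ioctl, whether the numbers come from the host template's bios_param or from scsicam_bios_param(). For reference, the unchanged userspace side:

#include <fcntl.h>
#include <linux/hdreg.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int print_geometry(const char *dev)
{
	struct hd_geometry geo;
	int fd = open(dev, O_RDONLY);

	if (fd < 0)
		return -1;
	if (ioctl(fd, HDIO_GETGEO, &geo) < 0) {
		close(fd);
		return -1;
	}
	printf("heads=%u sectors=%u cylinders=%u start=%lu\n",
	       geo.heads, geo.sectors, geo.cylinders, geo.start);
	close(fd);
	return 0;
}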
- * @bdev: which device + * @disk: which device * @capacity: size of the disk in sectors * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders * @@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds * * Returns : -1 on failure, 0 on success. */ -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip) { u64 capacity64 = capacity; /* Suppress gcc warning */ int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev, capacity, ip)) + if (scsi_partsize(disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b8668accf8e..00ad574ce61c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk) scsi_device_put(sdev); } -static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); + struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev, capacity, diskinfo); + host->hostt->bios_param(sdp, disk, capacity, diskinfo); else - scsicam_bios_param(bdev, capacity, diskinfo); + scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index 63ed7f9aaa93..d8ad02c29320 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work) } static int stex_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { int heads = 255, sectors = 63; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index d9e59204a9c3..dc51ea352198 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice, return 0; } -static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev, +static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info) { sector_t nsect = capacity; diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c index 5a380eecfc75..0c9987828774 100644 --- a/drivers/scsi/wd719x.c +++ b/drivers/scsi/wd719x.c @@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd) return wd719x_chip_init(wd) == 0 ? 
SUCCESS : FAILED; } -static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { if (capacity >= 0x200000) { diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index f991cf759836..db4e09042469 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -861,7 +861,7 @@ new_bio: bio = bio_kmalloc(nr_vecs, GFP_KERNEL); if (!bio) goto fail; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, + bio_init_inline(bio, NULL, nr_vecs, rw ? REQ_OP_WRITE : REQ_OP_READ); bio->bi_end_io = pscsi_bi_endio; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 46aa85af13dc..9802b2cc29bb 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -337,8 +337,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) u64 copied = 0; size_t orig_count; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || - !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { @@ -434,7 +433,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, dio->submit.iter); + ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b69c294e3ef0..a05e3793f93a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -231,7 +231,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); + bio_init_inline(bio, sb->s_bdev, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0a25716820fe..851254f36eb3 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,6 +13,7 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ + BIP_P2P_DMA = 1 << 8, /* using P2P address */ }; struct bio_integrity_payload { diff --git a/include/linux/bio.h b/include/linux/bio.h index 46ffac5caab7..a64a30131031 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes); +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align); /** * bio_next_split - get next @sectors from a bio, splitting if necessary @@ -405,6 +405,11 @@ struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); +static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, + unsigned short 
max_vecs, blk_opf_t opf) +{ + bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf); +} extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); @@ -441,7 +446,14 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask); + +static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, 0); +} + void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e67a2b6e8f11..b659373788f6 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -4,6 +4,7 @@ #include <linux/blk-mq.h> #include <linux/bio-integrity.h> +#include <linux/blk-mq-dma.h> struct request; @@ -26,11 +27,25 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); + +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); +} + int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter); +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -109,12 +124,29 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return false; +} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { return -EINVAL; } +static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + return false; +} +static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + return false; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c26a01aeae00..51829958d872 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -5,6 +5,13 @@ #include <linux/blk-mq.h> #include <linux/pci-p2pdma.h> +struct blk_map_iter { + struct bvec_iter iter; + struct bio *bio; + struct bio_vec *bvecs; + bool is_integrity; +}; + struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t 
addr; @@ -14,7 +21,7 @@ struct blk_dma_iter { blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ - struct req_iterator iter; + struct blk_map_iter iter; struct pci_p2pdma_map_state p2pdma; }; @@ -36,19 +43,20 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_rq_dma_unmap - try to DMA unmap a request + * blk_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap + * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) +static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { - if (req->cmd_flags & REQ_P2PDMA) + if (is_p2p) return true; if (dma_use_iova(state)) { @@ -60,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, return !dma_need_unmap(dma_dev); } +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + req->cmd_flags & REQ_P2PDMA); +} + #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2a5a828f19a0..b25d12545f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,8 @@ enum hctx_type { * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). + * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent + * use-after-free when iterating tags. * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. @@ -531,6 +533,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + struct srcu_struct tags_srcu; struct rw_semaphore update_nr_hwq_lock; }; @@ -767,6 +770,7 @@ struct blk_mq_tags { * request pool */ spinlock_t lock; + struct rcu_head rcu_head; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09b99d52fd36..8e8d1cc8b06c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct bio_issue { - u64 value; -}; - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; @@ -242,7 +238,8 @@ struct bio { * on release of the bio. */ struct blkcg_gq *bi_blkg; - struct bio_issue bi_issue; + /* Time that this bio was issued. */ + u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif @@ -269,18 +266,16 @@ struct bio { struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_set *bi_pool; - - /* - * We can inline a number of vecs at the end of the bio, to avoid - * double allocations for a small number of bio_vecs. This member - * MUST obviously be kept at the very end of the bio. 
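blk_dma_unmap() is the old blk_rq_dma_unmap() body with the P2P decision hoisted into an is_p2p argument, so the new integrity helpers can key off BIP_P2P_DMA while the data path keeps using REQ_P2PDMA. A hedged sketch of a completion path using both wrappers; foo_*, iod and its fields are hypothetical driver bookkeeping:

static void foo_unmap_rq(struct foo_dev *fdev, struct request *req,
			 struct foo_iod *iod)
{
	/* Returns true when no per-segment unmap work is left. */
	if (!blk_rq_dma_unmap(req, fdev->dev, &iod->dma_state, iod->len))
		foo_unmap_segments(fdev, iod);		/* hypothetical */

	if (blk_integrity_rq(req) &&
	    !blk_rq_integrity_dma_unmap(req, fdev->dev, &iod->meta_dma_state,
					iod->meta_len))
		foo_unmap_meta_segments(fdev, iod);	/* hypothetical */
}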
- */ - struct bio_vec bi_inline_vecs[]; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) +static inline struct bio_vec *bio_inline_vecs(struct bio *bio) +{ + return (struct bio_vec *)(bio + 1); +} + /* * bio flags */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe1797bbec42..066e5309bd45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -657,6 +657,7 @@ enum { QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ QUEUE_FLAG_MAX }; @@ -999,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); @@ -1590,13 +1593,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev) return queue_dma_alignment(bdev_get_queue(bdev)); } -static inline bool bdev_iter_is_aligned(struct block_device *bdev, - struct iov_iter *iter) -{ - return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), - bdev_logical_block_size(bdev) - 1); -} - static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { @@ -1660,7 +1656,7 @@ struct block_device_operations { unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ @@ -1870,6 +1866,20 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev) return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } +static inline int bio_split_rw_at(struct bio *bio, + const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) +{ + return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); +} + +static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 0620dd67369f..21de0935775d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e86c653186c..5b127043a151 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -286,8 +286,6 @@ size_t 
_copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ba460b6c0374..9c6e90829dbd 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim); extern int sas_change_queue_depth(struct scsi_device *, int new_depth); -extern int sas_bios_param(struct scsi_device *, struct block_device *, +extern int sas_bios_param(struct scsi_device *, struct gendisk *, sector_t capacity, int *hsc); int sas_execute_internal_abort_single(struct domain_device *device, u16 tag, unsigned int qid, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index c53812b9026f..f5a243261236 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -318,7 +318,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* bios_param)(struct scsi_device *, struct block_device *, + int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 08edd603e521..1131f51ed2c8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -13,7 +13,8 @@ #ifndef SCSICAM_H #define SCSICAM_H -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]); -unsigned char *scsi_bios_ptable(struct block_device *bdev); +struct gendisk; +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip); +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); +unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9193f952f49..2fe66a6b8789 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -784,101 +784,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); -static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct iovec *iov = iter_iov(i); - size_t size = i->count; - size_t skip = i->iov_offset; - - do { - size_t len = iov->iov_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(iov->iov_base + skip) & addr_mask) - return false; - - iov++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct bio_vec *bvec = i->bvec; - unsigned skip = i->iov_offset; - size_t size = i->count; - - do { - size_t len = bvec->bv_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) - return false; - - bvec++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -/** - * iov_iter_is_aligned() - Check if the addresses and lengths of each segments - * are aligned to the parameters. 
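With iov_iter_is_aligned() gone, direct-I/O alignment is no longer checked in a separate up-front pass; bio_iov_iter_get_pages_aligned() rejects misaligned segments while gathering pages. The userspace contract stays the same; an O_DIRECT read still wants logical-block-size alignment for both address and length:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* lbs: the device's logical block size, e.g. 512 or 4096. */
int read_one_block(const char *path, size_t lbs)
{
	void *buf;
	ssize_t ret;
	int fd = open(path, O_RDONLY | O_DIRECT);

	if (fd < 0)
		return -1;
	/* Aligned address and block-multiple length. */
	if (posix_memalign(&buf, lbs, lbs)) {
		close(fd);
		return -1;
	}
	/* A misaligned buffer now fails during page gathering with EINVAL. */
	ret = read(fd, buf, lbs);
	free(buf);
	close(fd);
	return ret == (ssize_t)lbs ? 0 : -1;
}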
- * - * @i: &struct iov_iter to restore - * @addr_mask: bit mask to check against the iov element's addresses - * @len_mask: bit mask to check against the iov element's lengths - * - * Return: false if any addresses or lengths intersect with the provided masks - */ -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - if (likely(iter_is_ubuf(i))) { - if (i->count & len_mask) - return false; - if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) - return false; - return true; - } - - if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return iov_iter_aligned_iovec(i, addr_mask, len_mask); - - if (iov_iter_is_bvec(i)) - return iov_iter_aligned_bvec(i, addr_mask, len_mask); - - /* With both xarray and folioq types, we're dealing with whole folios. */ - if (iov_iter_is_xarray(i)) { - if (i->count & len_mask) - return false; - if ((i->xarray_start + i->iov_offset) & addr_mask) - return false; - } - if (iov_iter_is_folioq(i)) { - if (i->count & len_mask) - return false; - if (i->iov_offset & addr_mask) - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(iov_iter_is_aligned); - static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs index 150f710efe5b..32c8d865afb6 100644 --- a/rust/kernel/block.rs +++ b/rust/kernel/block.rs @@ -3,3 +3,16 @@ //! Types for working with the block layer. pub mod mq; + +/// Bit mask for masking out [`SECTOR_SIZE`]. +pub const SECTOR_MASK: u32 = bindings::SECTOR_MASK; + +/// Sectors are size `1 << SECTOR_SHIFT`. +pub const SECTOR_SHIFT: u32 = bindings::SECTOR_SHIFT; + +/// Size of a sector. +pub const SECTOR_SIZE: u32 = bindings::SECTOR_SIZE; + +/// The difference between the size of a page and the size of a sector, +/// expressed as a power of two. +pub const PAGE_SECTORS_SHIFT: u32 = bindings::PAGE_SECTORS_SHIFT; diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 61ea35bba7d5..637018ead0ab 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -69,27 +69,33 @@ //! //! #[vtable] //! impl Operations for MyBlkDevice { +//! type QueueData = (); //! -//! fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result { +//! fn queue_rq(_queue_data: (), rq: ARef<Request<Self>>, _is_last: bool) -> Result { //! Request::end_ok(rq); //! Ok(()) //! } //! -//! fn commit_rqs() {} +//! fn commit_rqs(_queue_data: ()) {} +//! +//! fn complete(rq: ARef<Request<Self>>) { +//! Request::end_ok(rq) +//! .map_err(|_e| kernel::error::code::EIO) +//! .expect("Fatal error - expected to be able to end request"); +//! } //! } //! //! let tagset: Arc<TagSet<MyBlkDevice>> = //! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; //! let mut disk = gen_disk::GenDiskBuilder::new() //! .capacity_sectors(4096) -//! .build(fmt!("myblk"), tagset)?; +//! .build(fmt!("myblk"), tagset, ())?; //! //! # Ok::<(), kernel::error::Error>(()) //! ``` pub mod gen_disk; mod operations; -mod raw_writer; mod request; mod tag_set; diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index be92d0e5f031..1ce815c8cdab 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -5,10 +5,17 @@ //! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) //! 
C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) -use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; -use crate::fmt::{self, Write}; -use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; -use crate::{error, static_lock_class}; +use crate::{ + bindings, + block::mq::{Operations, TagSet}, + error::{self, from_err_ptr, Result}, + fmt::{self, Write}, + prelude::*, + static_lock_class, + str::NullTerminatedFormatter, + sync::Arc, + types::{ForeignOwnable, ScopeGuard}, +}; /// A builder for [`GenDisk`]. /// @@ -45,7 +52,7 @@ impl GenDiskBuilder { /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, /// and that it is a power of two. - fn validate_block_size(size: u32) -> Result { + pub fn validate_block_size(size: u32) -> Result { if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { Err(error::code::EINVAL) } else { @@ -92,7 +99,14 @@ impl GenDiskBuilder { self, name: fmt::Arguments<'_>, tagset: Arc<TagSet<T>>, + queue_data: T::QueueData, ) -> Result<GenDisk<T>> { + let data = queue_data.into_foreign(); + let recover_data = ScopeGuard::new(|| { + // SAFETY: T::QueueData was created by the call to `into_foreign()` above + drop(unsafe { T::QueueData::from_foreign(data) }); + }); + // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; @@ -107,7 +121,7 @@ impl GenDiskBuilder { bindings::__blk_mq_alloc_disk( tagset.raw_tag_set(), &mut lim, - core::ptr::null_mut(), + data, static_lock_class!().as_ptr(), ) })?; @@ -139,14 +153,14 @@ impl GenDiskBuilder { // SAFETY: `gendisk` is a valid pointer as we initialized it above unsafe { (*gendisk).fops = &TABLE }; - let mut raw_writer = RawWriter::from_array( + let mut writer = NullTerminatedFormatter::new( // SAFETY: `gendisk` points to a valid and initialized instance. We // have exclusive access, since the disk is not added to the VFS // yet. unsafe { &mut (*gendisk).disk_name }, - )?; - raw_writer.write_fmt(name)?; - raw_writer.write_char('\0')?; + ) + .ok_or(EINVAL)?; + writer.write_fmt(name)?; // SAFETY: `gendisk` points to a valid and initialized instance of // `struct gendisk`. `set_capacity` takes a lock to synchronize this @@ -161,8 +175,12 @@ impl GenDiskBuilder { }, )?; + recover_data.dismiss(); + // INVARIANT: `gendisk` was initialized above. // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above. + // INVARIANT: `gendisk.queue.queue_data` is set to `data` in the call to + // `__blk_mq_alloc_disk` above. Ok(GenDisk { _tagset: tagset, gendisk, @@ -174,9 +192,10 @@ impl GenDiskBuilder { /// /// # Invariants /// -/// - `gendisk` must always point to an initialized and valid `struct gendisk`. -/// - `gendisk` was added to the VFS through a call to -/// `bindings::device_add_disk`. +/// - `gendisk` must always point to an initialized and valid `struct gendisk`. +/// - `gendisk` was added to the VFS through a call to +/// `bindings::device_add_disk`. +/// - `self.gendisk.queue.queuedata` is initialized by a call to `ForeignOwnable::into_foreign`. 
pub struct GenDisk<T: Operations> { _tagset: Arc<TagSet<T>>, gendisk: *mut bindings::gendisk, @@ -188,9 +207,20 @@ unsafe impl<T: Operations + Send> Send for GenDisk<T> {} impl<T: Operations> Drop for GenDisk<T> { fn drop(&mut self) { + // SAFETY: By type invariant of `Self`, `self.gendisk` points to a valid + // and initialized instance of `struct gendisk`, and, `queuedata` was + // initialized with the result of a call to + // `ForeignOwnable::into_foreign`. + let queue_data = unsafe { (*(*self.gendisk).queue).queuedata }; + // SAFETY: By type invariant, `self.gendisk` points to a valid and // initialized instance of `struct gendisk`, and it was previously added // to the VFS. unsafe { bindings::del_gendisk(self.gendisk) }; + + // SAFETY: `queue.queuedata` was created by `GenDiskBuilder::build` with + // a call to `ForeignOwnable::into_foreign` to create `queuedata`. + // `ForeignOwnable::from_foreign` is only called here. + drop(unsafe { T::QueueData::from_foreign(queue_data) }); } } diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index c0f95a9419c4..f91a1719886c 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -6,15 +6,16 @@ use crate::{ bindings, - block::mq::request::RequestDataWrapper, - block::mq::Request, + block::mq::{request::RequestDataWrapper, Request}, error::{from_result, Result}, prelude::*, sync::Refcount, - types::ARef, + types::{ARef, ForeignOwnable}, }; use core::marker::PhantomData; +type ForeignBorrowed<'a, T> = <T as ForeignOwnable>::Borrowed<'a>; + /// Implement this trait to interface blk-mq as block devices. /// /// To implement a block device driver, implement this trait as described in the @@ -27,12 +28,23 @@ use core::marker::PhantomData; /// [module level documentation]: kernel::block::mq #[macros::vtable] pub trait Operations: Sized { + /// Data associated with the `struct request_queue` that is allocated for + /// the `GenDisk` associated with this `Operations` implementation. + type QueueData: ForeignOwnable; + /// Called by the kernel to queue a request with the driver. If `is_last` is /// `false`, the driver is allowed to defer committing the request. - fn queue_rq(rq: ARef<Request<Self>>, is_last: bool) -> Result; + fn queue_rq( + queue_data: ForeignBorrowed<'_, Self::QueueData>, + rq: ARef<Request<Self>>, + is_last: bool, + ) -> Result; /// Called by the kernel to indicate that queued requests should be submitted. - fn commit_rqs(); + fn commit_rqs(queue_data: ForeignBorrowed<'_, Self::QueueData>); + + /// Called by the kernel when the request is completed. + fn complete(rq: ARef<Request<Self>>); /// Called by the kernel to poll the device for completed requests. Only /// used for poll queues. @@ -71,7 +83,7 @@ impl<T: Operations> OperationsVTable<T> { /// promise to not access the request until the driver calls /// `bindings::blk_mq_end_request` for the request. unsafe extern "C" fn queue_rq_callback( - _hctx: *mut bindings::blk_mq_hw_ctx, + hctx: *mut bindings::blk_mq_hw_ctx, bd: *const bindings::blk_mq_queue_data, ) -> bindings::blk_status_t { // SAFETY: `bd.rq` is valid as required by the safety requirement for @@ -89,10 +101,20 @@ impl<T: Operations> OperationsVTable<T> { // reference counted by `ARef` until then. let rq = unsafe { Request::aref_from_raw((*bd).rq) }; + // SAFETY: `hctx` is valid as required by this function. 
+ let queue_data = unsafe { (*(*hctx).queue).queuedata }; + + // SAFETY: `queue.queuedata` was created by `GenDiskBuilder::build` with + // a call to `ForeignOwnable::into_foreign` to create `queuedata`. + // `ForeignOwnable::from_foreign` is only called when the tagset is + // dropped, which happens after we are dropped. + let queue_data = unsafe { T::QueueData::borrow(queue_data) }; + // SAFETY: We have exclusive access and we just set the refcount above. unsafe { Request::start_unchecked(&rq) }; let ret = T::queue_rq( + queue_data, rq, // SAFETY: `bd` is valid as required by the safety requirement for // this function. @@ -111,18 +133,35 @@ impl<T: Operations> OperationsVTable<T> { /// /// # Safety /// - /// This function may only be called by blk-mq C infrastructure. - unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) { - T::commit_rqs() + /// This function may only be called by blk-mq C infrastructure. The caller + /// must ensure that `hctx` is valid. + unsafe extern "C" fn commit_rqs_callback(hctx: *mut bindings::blk_mq_hw_ctx) { + // SAFETY: `hctx` is valid as required by this function. + let queue_data = unsafe { (*(*hctx).queue).queuedata }; + + // SAFETY: `queue.queuedata` was created by `GenDiskBuilder::build()` with a + // call to `ForeignOwnable::into_foreign()` to create `queuedata`. + // `ForeignOwnable::from_foreign()` is only called when the tagset is + // dropped, which happens after we are dropped. + let queue_data = unsafe { T::QueueData::borrow(queue_data) }; + T::commit_rqs(queue_data) } - /// This function is called by the C kernel. It is not currently - /// implemented, and there is no way to exercise this code path. + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. /// /// # Safety /// - /// This function may only be called by blk-mq C infrastructure. - unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {} + /// This function may only be called by blk-mq C infrastructure. `rq` must + /// point to a valid request that has been marked as completed. The pointee + /// of `rq` must be valid for write for the duration of this function. + unsafe extern "C" fn complete_callback(rq: *mut bindings::request) { + // SAFETY: This function can only be dispatched through + // `Request::complete`. We leaked a refcount then which we pick back up + // now. + let aref = unsafe { Request::aref_from_raw(rq) }; + T::complete(aref); + } /// This function is called by the C kernel. A pointer to this function is /// installed in the `blk_mq_ops` vtable for the driver. diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs deleted file mode 100644 index d311e24e2595..000000000000 --- a/rust/kernel/block/mq/raw_writer.rs +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -use crate::error::Result; -use crate::fmt::{self, Write}; -use crate::prelude::EINVAL; - -/// A mutable reference to a byte buffer where a string can be written into. -/// -/// # Invariants -/// -/// `buffer` is always null terminated. -pub(crate) struct RawWriter<'a> { - buffer: &'a mut [u8], - pos: usize, -} - -impl<'a> RawWriter<'a> { - /// Create a new `RawWriter` instance. - fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> { - *(buffer.last_mut().ok_or(EINVAL)?) = 0; - - // INVARIANT: We null terminated the buffer above.
- Ok(Self { buffer, pos: 0 }) - } - - pub(crate) fn from_array<const N: usize>( - a: &'a mut [crate::ffi::c_char; N], - ) -> Result<RawWriter<'a>> { - Self::new( - // SAFETY: the buffer of `a` is valid for read and write as `u8` for - // at least `N` bytes. - unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::<u8>(), N) }, - ) - } -} - -impl Write for RawWriter<'_> { - fn write_str(&mut self, s: &str) -> fmt::Result { - let bytes = s.as_bytes(); - let len = bytes.len(); - - // We do not want to overwrite our null terminator - if self.pos + len > self.buffer.len() - 1 { - return Err(fmt::Error); - } - - // INVARIANT: We are not overwriting the last byte - self.buffer[self.pos..self.pos + len].copy_from_slice(bytes); - - self.pos += len; - - Ok(()) - } -} diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index f62a376dc313..c5f1f6b1ccfb 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -53,7 +53,7 @@ use core::{marker::PhantomData, ptr::NonNull}; /// [`struct request`]: srctree/include/linux/blk-mq.h /// #[repr(transparent)] -pub struct Request<T: Operations>(Opaque<bindings::request>, PhantomData<T>); +pub struct Request<T>(Opaque<bindings::request>, PhantomData<T>); impl<T: Operations> Request<T> { /// Create an [`ARef<Request>`] from a [`struct request`] pointer. @@ -138,6 +138,23 @@ impl<T: Operations> Request<T> { Ok(()) } + /// Complete the request by scheduling `Operations::complete` for + /// execution. + /// + /// The function may be scheduled locally, via SoftIRQ or remotely via IPI. + /// See `blk_mq_complete_request_remote` in [`blk-mq.c`] for details. + /// + /// [`blk-mq.c`]: srctree/block/blk-mq.c + pub fn complete(this: ARef<Self>) { + let ptr = ARef::into_raw(this).cast::<bindings::request>().as_ptr(); + // SAFETY: By type invariant, `self.0` is a valid `struct request` + if !unsafe { bindings::blk_mq_complete_request_remote(ptr) } { + // SAFETY: We released a refcount above that we can reclaim here. + let this = unsafe { Request::aref_from_raw(ptr) }; + T::complete(this); + } + } + /// Return a pointer to the [`RequestDataWrapper`] stored in the private area /// of the request structure. /// @@ -151,7 +168,7 @@ impl<T: Operations> Request<T> { // valid allocation. let wrapper_ptr = unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() }; - // SAFETY: By C API contract, wrapper_ptr points to a valid allocation + // SAFETY: By C API contract, `wrapper_ptr` points to a valid allocation // and is not null. unsafe { NonNull::new_unchecked(wrapper_ptr) } } diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs index 9fb5ef825e41..10f1547ca9f1 100644 --- a/rust/kernel/configfs.rs +++ b/rust/kernel/configfs.rs @@ -1039,3 +1039,5 @@ macro_rules! configfs_attrs { }; } + +pub use crate::configfs_attrs; diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index 6c892550c0ba..5c74e5f77601 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -2,11 +2,16 @@ //! String representations. -use crate::alloc::{flags::*, AllocError, KVec}; -use crate::fmt::{self, Write}; -use core::ops::{self, Deref, DerefMut, Index}; - -use crate::prelude::*; +use crate::{ + alloc::{flags::*, AllocError, KVec}, + error::{to_result, Result}, + fmt::{self, Write}, + prelude::*, +}; +use core::{ + marker::PhantomData, + ops::{self, Deref, DerefMut, Index}, +}; /// Byte string without UTF-8 validity guarantee.
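Request::complete() above is the driver-facing half of the new completion flow: it hands the request to blk_mq_complete_request_remote() and only runs Operations::complete inline when blk-mq declines to defer. A hedged sketch of an interrupt-side caller, reusing the MyBlkDevice driver from the module-level example:

use kernel::block::mq::Request;
use kernel::types::ARef;

// Hypothetical helper invoked when the hardware reports a completion.
fn on_hw_completion(rq: ARef<Request<MyBlkDevice>>) {
    // Either schedules `MyBlkDevice::complete(rq)` via SoftIRQ/IPI on the
    // submitting CPU, or runs it right here when no deferral is needed.
    Request::complete(rq);
}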
#[repr(transparent)] @@ -732,7 +737,7 @@ mod tests { /// /// The memory region between `pos` (inclusive) and `end` (exclusive) is valid for writes if `pos` /// is less than `end`. -pub(crate) struct RawFormatter { +pub struct RawFormatter { // Use `usize` to use `saturating_*` functions. beg: usize, pos: usize, @@ -790,7 +795,7 @@ impl RawFormatter { } /// Returns the number of bytes written to the formatter. - pub(crate) fn bytes_written(&self) -> usize { + pub fn bytes_written(&self) -> usize { self.pos - self.beg } } @@ -824,9 +829,9 @@ impl fmt::Write for RawFormatter { /// Allows formatting of [`fmt::Arguments`] into a raw buffer. /// /// Fails if callers attempt to write more than will fit in the buffer. -pub(crate) struct Formatter(RawFormatter); +pub struct Formatter<'a>(RawFormatter, PhantomData<&'a mut ()>); -impl Formatter { +impl Formatter<'_> { /// Creates a new instance of [`Formatter`] with the given buffer. /// /// # Safety /// @@ -835,11 +840,18 @@ impl Formatter { /// for the lifetime of the returned [`Formatter`]. pub(crate) unsafe fn from_buffer(buf: *mut u8, len: usize) -> Self { // SAFETY: The safety requirements of this function satisfy those of the callee. - Self(unsafe { RawFormatter::from_buffer(buf, len) }) + Self(unsafe { RawFormatter::from_buffer(buf, len) }, PhantomData) + } + + /// Create a new [`Self`] instance. + pub fn new(buffer: &mut [u8]) -> Self { + // SAFETY: `buffer` is valid for writes for its entire length for + // the lifetime of `Self`. + unsafe { Formatter::from_buffer(buffer.as_mut_ptr(), buffer.len()) } + } } -impl Deref for Formatter { +impl Deref for Formatter<'_> { type Target = RawFormatter; fn deref(&self) -> &Self::Target { @@ -847,7 +859,7 @@ impl Deref for Formatter { } } -impl fmt::Write for Formatter { +impl fmt::Write for Formatter<'_> { fn write_str(&mut self, s: &str) -> fmt::Result { self.0.write_str(s)?; @@ -860,6 +872,132 @@ impl fmt::Write for Formatter { } } +/// A mutable reference to a byte buffer where a string can be written into. +/// +/// The buffer will be automatically null terminated after the last written character. +/// +/// # Invariants +/// +/// * The first byte of `buffer` is always zero. +/// * The length of `buffer` is at least 1. +pub(crate) struct NullTerminatedFormatter<'a> { + buffer: &'a mut [u8], +} + +impl<'a> NullTerminatedFormatter<'a> { + /// Create a new [`Self`] instance. + pub(crate) fn new(buffer: &'a mut [u8]) -> Option<NullTerminatedFormatter<'a>> { + *(buffer.first_mut()?) = 0; + + // INVARIANT: + // - We wrote zero to the first byte above. + // - If `buffer` were not of length at least 1, `buffer.first_mut()` would return `None`. + Some(Self { buffer }) + } +} + +impl Write for NullTerminatedFormatter<'_> { + fn write_str(&mut self, s: &str) -> fmt::Result { + let bytes = s.as_bytes(); + let len = bytes.len(); + + // We want space for a zero. By type invariant, buffer length is always at least 1, so no + // underflow. + if len > self.buffer.len() - 1 { + return Err(fmt::Error); + } + + let buffer = core::mem::take(&mut self.buffer); + // We break the zero start invariant for a short while. + buffer[..len].copy_from_slice(bytes); + // INVARIANT: We checked above that the buffer will have length at least 1 after this assignment. + self.buffer = &mut buffer[len..]; + + // INVARIANT: We write zero to the first byte of the buffer. + self.buffer[0] = 0; + + Ok(()) + } +} + +/// # Safety +/// +/// - `string` must point to a null terminated string that is valid for read.
+unsafe fn kstrtobool_raw(string: *const u8) -> Result<bool> { + let mut result: bool = false; + + // SAFETY: + // - By function safety requirement, `string` is a valid null-terminated string. + // - `result` is a valid `bool` that we own. + to_result(unsafe { bindings::kstrtobool(string, &mut result) })?; + Ok(result) +} + +/// Convert common user inputs into boolean values using the kernel's `kstrtobool` function. +/// +/// This routine returns `Ok(bool)` if the first character is one of 'YyTt1NnFf0', or +/// \[oO\]\[NnFf\] for "on" and "off". Otherwise it will return `Err(EINVAL)`. +/// +/// # Examples +/// +/// ``` +/// # use kernel::{c_str, str::kstrtobool}; +/// +/// // Lowercase +/// assert_eq!(kstrtobool(c_str!("true")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("tr")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("t")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("twrong")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("false")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("f")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("yes")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("no")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("on")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("off")), Ok(false)); +/// +/// // Camel case +/// assert_eq!(kstrtobool(c_str!("True")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("False")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("Yes")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("No")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("On")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("Off")), Ok(false)); +/// +/// // All caps +/// assert_eq!(kstrtobool(c_str!("TRUE")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("FALSE")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("YES")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("NO")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("ON")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("OFF")), Ok(false)); +/// +/// // Numeric +/// assert_eq!(kstrtobool(c_str!("1")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("0")), Ok(false)); +/// +/// // Invalid input +/// assert_eq!(kstrtobool(c_str!("invalid")), Err(EINVAL)); +/// assert_eq!(kstrtobool(c_str!("2")), Err(EINVAL)); +/// ``` +pub fn kstrtobool(string: &CStr) -> Result<bool> { + // SAFETY: + // - The pointer returned by `CStr::as_char_ptr` is guaranteed to be + // null terminated. + // - `string` is live and thus the string is valid for read. + unsafe { kstrtobool_raw(string.as_char_ptr()) } +} + +/// Convert `&[u8]` to `bool` by deferring to [`kernel::str::kstrtobool`]. +/// +/// Only considers at most the first two bytes of `bytes`. +pub fn kstrtobool_bytes(bytes: &[u8]) -> Result<bool> { + // `kstrtobool` only considers the first two bytes of the input. + let stack_string = [*bytes.first().unwrap_or(&0), *bytes.get(1).unwrap_or(&0), 0]; + // SAFETY: `stack_string` is null terminated and it is live on the stack so + // it is valid for read. + unsafe { kstrtobool_raw(stack_string.as_ptr()) } +} + /// An owned string that is guaranteed to have exactly one `NUL` byte, which is at the end. /// /// Used for interoperability with kernel APIs that take C strings.
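For in-crate callers, `NullTerminatedFormatter` takes over the job of the removed `RawWriter` (it stays `pub(crate)`). A minimal sketch of the pattern, not code from this series: `write_c_string` is a hypothetical helper, shown only to illustrate how the formatter keeps the buffer null terminated after every write.

    // In-crate sketch; `NullTerminatedFormatter` is `pub(crate)`.
    use crate::fmt::{self, Write};
    use crate::str::NullTerminatedFormatter;

    /// Hypothetical helper: render `s` into `buf` as a null-terminated C
    /// string, failing if `buf` is empty or `s` does not fit.
    fn write_c_string(buf: &mut [u8], s: &str) -> fmt::Result {
        let mut w = NullTerminatedFormatter::new(buf).ok_or(fmt::Error)?;
        // Each `write_str` re-establishes the invariant that the byte after
        // the last written character is zero, so `buf` remains a valid C
        // string at every step.
        w.write_str(s)
    }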
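With `Formatter` and `bytes_written` now public, and `kstrtobool_bytes` accepting buffers that are not null terminated, a driver can pair buffer-backed formatting on the read side with forgiving boolean parsing on the write side. A hedged sketch follows; the `show`/`store` names, the `enabled` flag, and the callback shapes are hypothetical, not a real configfs or sysfs signature.

    use kernel::fmt::Write;
    use kernel::prelude::*;
    use kernel::str::{kstrtobool_bytes, Formatter};

    /// Hypothetical `show`-style callback: render state into the caller's
    /// page buffer and report how many bytes were written.
    fn show(enabled: bool, page: &mut [u8]) -> Result<usize> {
        let mut f = Formatter::new(page);
        // `?` relies on the kernel's `From<fmt::Error> for Error`
        // conversion (EINVAL).
        writeln!(f, "{}", if enabled { "1" } else { "0" })?;
        Ok(f.bytes_written())
    }

    /// Hypothetical `store`-style callback: `kstrtobool` inspects at most
    /// the first two bytes, so a trailing newline from `echo` is harmless.
    fn store(input: &[u8]) -> Result<bool> {
        kstrtobool_bytes(input)
    }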
diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs index 5005453f874d..0ccc7553ef39 100644 --- a/samples/rust/rust_configfs.rs +++ b/samples/rust/rust_configfs.rs @@ -5,7 +5,7 @@ use kernel::alloc::flags; use kernel::c_str; use kernel::configfs; -use kernel::configfs_attrs; +use kernel::configfs::configfs_attrs; use kernel::new_mutex; use kernel::page::PAGE_SIZE; use kernel::prelude::*; diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 5d7f4ecfb816..770269efe42a 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -20,6 +20,7 @@ TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh +TEST_PROGS += test_generic_13.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index b71faba86c3b..6b8123c12a7a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1384,21 +1384,23 @@ static int cmd_dev_list(struct dev_ctx *ctx) static int cmd_dev_get_features(void) { #define const_ilog2(x) (63 - __builtin_clzll(x)) +#define FEAT_NAME(f) [const_ilog2(f)] = #f static const char *feat_map[] = { - [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", - [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", - [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", - [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", - [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", - [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", - [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", - [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", - [const_ilog2(UBLK_F_ZONED)] = "ZONED", - [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", - [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE", - [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG", - [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE", - [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON", + FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY), + FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK), + FEAT_NAME(UBLK_F_NEED_GET_DATA), + FEAT_NAME(UBLK_F_USER_RECOVERY), + FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE), + FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV), + FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE), + FEAT_NAME(UBLK_F_USER_COPY), + FEAT_NAME(UBLK_F_ZONED), + FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO), + FEAT_NAME(UBLK_F_UPDATE_SIZE), + FEAT_NAME(UBLK_F_AUTO_BUF_REG), + FEAT_NAME(UBLK_F_QUIESCE), + FEAT_NAME(UBLK_F_PER_IO_DAEMON), + FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), }; struct ublk_dev *dev; __u64 features = 0; @@ -1425,7 +1427,7 @@ static int cmd_dev_get_features(void) feat = feat_map[i]; else feat = "unknown"; - printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); + printf("0x%-16llx: %s\n", 1ULL << i, feat); } } diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh index 9227a208ba53..21a31cd5491a 100755 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! 
_have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "sequential io order" dev_id=$(_add_ublk_dev -t null) diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 3e80121e3bf5..12920768b1a0 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "sequential io order for MQ" dev_id=$(_add_ublk_dev -t null -q 2) diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index 7abbb00d251d..b4046201b4d9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "do imbalanced load, it should be balanced over I/O threads" NTHREADS=6 diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh new file mode 100755 index 000000000000..b7aa90b1cb74 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_13" +ERR_CODE=0 + +_prep_test "null" "check that feature list is complete" + +if ${UBLK_PROG} features | grep -q unknown; then + echo "# unknown feature detected!" + echo "# did you add a feature and forget to update feat_map in kublk?" + echo "# this failure is expected if running an older test suite against" + echo "# a newer kernel with new features added" + ERR_CODE=255 +fi + +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index a34203f72668..c2cb8f7a09fe 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -6,6 +6,10 @@ TID="null_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "basic IO test" dev_id=$(_add_ublk_dev -t null) diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 5633ca876655..8accd35beb55 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -6,6 +6,10 @@ TID="null_02" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "basic IO test with zero copy" dev_id=$(_add_ublk_dev -t null -z) diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 566cfd90d192..274295061042 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -5,6 +5,10 @@ TID="stress_05" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + run_io_and_remove() { local size=$1 |
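Taken together, the blk-mq changes above hand each callback the queue's `queuedata` and route completions through `Operations::complete`. For orientation, a minimal driver-side sketch, loosely modeled on a null-block-style driver: it is not part of this series, `MyDriver` and `MyQueueState` are hypothetical, the trait items shown are abbreviated (tag set and per-request setup elided), and the signatures are approximated from the vtable callbacks above.

    use kernel::block::mq::{Operations, Request};
    use kernel::prelude::*;
    use kernel::sync::{Arc, ArcBorrow};
    use kernel::types::ARef;

    struct MyQueueState; // hypothetical per-queue driver state
    struct MyDriver;

    impl Operations for MyDriver {
        // `GenDiskBuilder::build` stores this in `queue.queuedata`; each
        // callback receives it back in borrowed form (`ArcBorrow` for `Arc`).
        type QueueData = Arc<MyQueueState>;

        fn queue_rq(
            _queue: ArcBorrow<'_, MyQueueState>,
            rq: ARef<Request<Self>>,
            _is_last: bool,
        ) -> Result {
            // Defer completion: `Request::complete` completes the request
            // remotely (SoftIRQ/IPI, re-entering through `complete_callback`)
            // or, failing that, calls `Self::complete` directly.
            Request::complete(rq);
            Ok(())
        }

        fn complete(rq: ARef<Request<Self>>) {
            // Reclaim the reference and end the request; error handling
            // elided in this sketch.
            let _ = Request::end_ok(rq);
        }

        fn commit_rqs(_queue: ArcBorrow<'_, MyQueueState>) {}
    }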