diff options
Diffstat (limited to 'drivers/nvme/host/multipath.c')
-rw-r--r-- | drivers/nvme/host/multipath.c | 209 |
1 files changed, 190 insertions, 19 deletions
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 250f3da67cc9..878ea8b1a0ac 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -10,10 +10,61 @@ #include "nvme.h" bool multipath = true; -module_param(multipath, bool, 0444); +static bool multipath_always_on; + +static int multipath_param_set(const char *val, const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + if (multipath_always_on && !*arg) { + pr_err("Can't disable multipath when multipath_always_on is configured.\n"); + *arg = true; + return -EINVAL; + } + + return 0; +} + +static const struct kernel_param_ops multipath_param_ops = { + .set = multipath_param_set, + .get = param_get_bool, +}; + +module_param_cb(multipath, &multipath_param_ops, &multipath, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +static int multipath_always_on_set(const char *val, + const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret < 0) + return ret; + + if (*arg) + multipath = true; + + return 0; +} + +static const struct kernel_param_ops multipath_always_on_ops = { + .set = multipath_always_on_set, + .get = param_get_bool, +}; + +module_param_cb(multipath_always_on, &multipath_always_on_ops, + &multipath_always_on, 0444); +MODULE_PARM_DESC(multipath_always_on, + "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support"); + static const char *nvme_iopolicy_names[] = { [NVME_IOPOLICY_NUMA] = "numa", [NVME_IOPOLICY_RR] = "round-robin", @@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head) break; } } - return false; + + /* + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do + * not immediately fail I/O. Instead, requeue the I/O for the configured + * duration, anticipating that if there's a transient link failure then + * it may recover within this time window. This parameter is exported to + * userspace via sysfs, and its default value is zero. It is internally + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is + * non-zero, this flag is set to true. When zero, the flag is cleared. + */ + return nvme_mpath_queue_if_no_path(head); } static void nvme_ns_head_submit_bio(struct bio *bio) @@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work) } } +static void nvme_remove_head(struct nvme_ns_head *head) +{ + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + /* + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared + * to allow multipath to fail all I/O. + */ + kblockd_schedule_work(&head->requeue_work); + + nvme_cdev_del(&head->cdev, &head->cdev_device); + synchronize_srcu(&head->srcu); + del_gendisk(head->disk); + nvme_put_ns_head(head); + } +} + +static void nvme_remove_head_work(struct work_struct *work) +{ + struct nvme_ns_head *head = container_of(to_delayed_work(work), + struct nvme_ns_head, remove_work); + bool remove = false; + + mutex_lock(&head->subsys->lock); + if (list_empty(&head->list)) { + list_del_init(&head->entry); + remove = true; + } + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); + + module_put(THIS_MODULE); +} + int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; @@ -626,19 +721,31 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work); + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work); + head->delayed_removal_secs = 0; /* - * Add a multipath node if the subsystems supports multiple controllers. - * We also do this for private namespaces as the namespace sharing flag - * could change after a rescan. + * If "multipath_always_on" is enabled, a multipath node is added + * regardless of whether the disk is single/multi ported, and whether + * the namespace is shared or private. If "multipath_always_on" is not + * enabled, a multipath node is added only if the subsystem supports + * multiple controllers and the "multipath" option is configured. In + * either case, for private namespaces, we ensure that the NSID is + * unique. */ - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - !nvme_is_unique_nsid(ctrl, head) || !multipath) + if (!multipath_always_on) { + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + !multipath) + return 0; + } + + if (!nvme_is_unique_nsid(ctrl, head)) return 0; blk_set_stacking_limits(&lim); lim.dma_alignment = 3; - lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | + BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES; if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; @@ -659,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state); sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); + nvme_tryget_ns_head(head); return 0; } @@ -1015,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr } DEVICE_ATTR_RO(numa_nodes); +static ssize_t delayed_removal_secs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + int ret; + + mutex_lock(&head->subsys->lock); + ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs); + mutex_unlock(&head->subsys->lock); + return ret; +} + +static ssize_t delayed_removal_secs_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + unsigned int sec; + int ret; + + ret = kstrtouint(buf, 0, &sec); + if (ret < 0) + return ret; + + mutex_lock(&head->subsys->lock); + head->delayed_removal_secs = sec; + if (sec) + set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + else + clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + mutex_unlock(&head->subsys->lock); + /* + * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen + * by its reader. + */ + synchronize_srcu(&head->srcu); + + return count; +} + +DEVICE_ATTR_RW(delayed_removal_secs); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { @@ -1136,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) #endif } -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +void nvme_mpath_remove_disk(struct nvme_ns_head *head) { - if (!head->disk) - return; - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - nvme_cdev_del(&head->cdev, &head->cdev_device); + bool remove = false; + + mutex_lock(&head->subsys->lock); + /* + * We are called when all paths have been removed, and at that point + * head->list is expected to be empty. However, nvme_remove_ns() and + * nvme_init_ns_head() can run concurrently and so if head->delayed_ + * removal_secs is configured, it is possible that by the time we reach + * this point, head->list may no longer be empty. Therefore, we recheck + * head->list here. If it is no longer empty then we skip enqueuing the + * delayed head removal work. + */ + if (!list_empty(&head->list)) + goto out; + + if (head->delayed_removal_secs) { /* - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared - * to allow multipath to fail all I/O. + * Ensure that no one could remove this module while the head + * remove work is pending. */ - synchronize_srcu(&head->srcu); - kblockd_schedule_work(&head->requeue_work); - del_gendisk(head->disk); + if (!try_module_get(THIS_MODULE)) + goto out; + queue_delayed_work(nvme_wq, &head->remove_work, + head->delayed_removal_secs * HZ); + } else { + list_del_init(&head->entry); + remove = true; } +out: + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); } -void nvme_mpath_remove_disk(struct nvme_ns_head *head) +void nvme_mpath_put_disk(struct nvme_ns_head *head) { if (!head->disk) return; |