Diffstat (limited to 'drivers/nvme/host')

 drivers/nvme/host/Kconfig     |   4
 drivers/nvme/host/auth.c      |  30
 drivers/nvme/host/core.c      | 205
 drivers/nvme/host/fc.c        |  13
 drivers/nvme/host/multipath.c | 206
 drivers/nvme/host/nvme.h      |  31
 drivers/nvme/host/pci.c       | 300
 drivers/nvme/host/sysfs.c     |   7
 drivers/nvme/host/tcp.c       | 138

 9 files changed, 663 insertions(+), 271 deletions(-)
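The Kconfig hunk that opens this diff (select CRC32 / NET_CRC32C instead of CRYPTO_CRC32C) pairs with the tcp.c changes at the end: the crypto_ahash("crc32c") transforms are dropped and the header/data digests are computed directly as ~crc32c(~0, buf, len). As a rough user-space illustration of the digest those new nvme_tcp_hdgst()/nvme_tcp_ddgst_final() helpers produce, here is a standalone sketch; the bitwise crc32c() and the nvme_tcp_digest() wrapper below are illustrative reference code, not the kernel's implementation, and the sample PDU bytes are arbitrary.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Reference bitwise CRC-32C (Castagnoli), reflected, polynomial 0x82F63B78. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/*
 * NVMe/TCP digests are CRC-32C seeded with all-ones and inverted at the
 * end, which the patched tcp.c expresses as ~crc32c(NVME_TCP_CRC_SEED, ...)
 * with NVME_TCP_CRC_SEED == ~0.
 */
static uint32_t nvme_tcp_digest(const void *pdu, size_t len)
{
	return ~crc32c(~0U, pdu, len);
}

int main(void)
{
	static const uint8_t pdu[8] = { 0x04, 0x03, 0x00, 0x18 };	/* arbitrary example bytes */

	printf("hdgst = %#010x\n", (unsigned)nvme_tcp_digest(pdu, sizeof(pdu)));
	return 0;
}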
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 4d64b6935bb9..7dca58f0a237 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -84,9 +84,9 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK + select CRC32 + select NET_CRC32C select NVME_FABRICS - select CRYPTO - select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using the TCP transport. This allows you to use remote block devices diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 6115fef74c1e..f6ddbe553289 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context { u32 s1; u32 s2; bool bi_directional; + bool authenticated; u16 transaction; u8 status; u8 dhgroup_id; @@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { nvme_auth_reset_dhchap(chap); + chap->authenticated = false; if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) @@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work) } if (!ret) { chap->error = 0; + chap->authenticated = true; if (ctrl->opts->concat && (ret = nvme_auth_secure_concat(ctrl, chap))) { dev_warn(ctrl->device, "%s: qid %d failed to enable secure concatenation\n", __func__, chap->qid); chap->error = ret; + chap->authenticated = false; } return; } @@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work) return; for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_negotiate(ctrl, q); - if (ret) { - dev_warn(ctrl->device, - "qid %d: error %d setting up authentication\n", - q, ret); - break; - } + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + /* + * Skip re-authentication if the queue had + * not been authenticated initially. + */ + if (!chap->authenticated) + continue; + cancel_work_sync(&chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); } /* @@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work) * the controller terminates the connection. 
*/ for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_wait(ctrl, q); + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + if (!chap->authenticated) + continue; + flush_work(&chap->auth_work); + ret = chap->error; + nvme_auth_reset_dhchap(chap); if (ret) dev_warn(ctrl->device, "qid %d: authentication failed\n", q); @@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) chap = &ctrl->dhchap_ctxs[i]; chap->qid = i; chap->ctrl = ctrl; + chap->authenticated = false; INIT_WORK(&chap->auth_work, nvme_queue_auth_work); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6b04473c0ab7..f69a232a000a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -38,6 +38,8 @@ struct nvme_ns_info { u32 nsid; __le32 anagrpid; u8 pi_offset; + u16 endgid; + u64 runs; bool is_shared; bool is_readonly; bool is_ready; @@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, struct nvme_command *cmd); +static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, + u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi); void nvme_queue_scan(struct nvme_ctrl *ctrl) { @@ -664,10 +668,11 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); - nvme_mpath_remove_disk(head); + nvme_mpath_put_disk(head); ida_free(&head->subsys->ns_ida, head->instance); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); + kfree(head->plids); kfree(head); } @@ -991,6 +996,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + if (op == nvme_cmd_write && ns->head->nr_plids) { + u16 write_stream = req->bio->bi_write_stream; + + if (WARN_ON_ONCE(write_stream > ns->head->nr_plids)) + return BLK_STS_INVAL; + + if (write_stream) { + dsmgmt |= ns->head->plids[write_stream - 1] << 16; + control |= NVME_RW_DTYPE_DPLCMT; + } + } + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) return BLK_STS_INVAL; @@ -1157,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL); if (ret) goto out; } @@ -1609,6 +1626,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl, info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; info->is_ready = true; + info->endgid = le16_to_cpu(id->endgid); if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { dev_info(ctrl->device, "Ignoring bogus Namespace Identifiers\n"); @@ -1649,6 +1667,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl, info->is_ready = id->nstat & NVME_NSTAT_NRDY; info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL; info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT; + info->endgid = le16_to_cpu(id->endgid); } kfree(id); return ret; @@ -1674,7 +1693,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result) + void *result) { return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, buflen, result); @@ -1683,7 +1702,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features); int nvme_get_features(struct 
nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result) + void *result) { return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, buflen, result); @@ -2167,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, return ret; } +static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl, + struct nvme_ns_info *info, u8 fdp_idx) +{ + struct nvme_fdp_config_log hdr, *h; + struct nvme_fdp_config_desc *desc; + size_t size = sizeof(hdr); + void *log, *end; + int i, n, ret; + + ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0, + NVME_CSI_NVM, &hdr, size, 0, info->endgid); + if (ret) { + dev_warn(ctrl->device, + "FDP configs log header status:0x%x endgid:%d\n", ret, + info->endgid); + return ret; + } + + size = le32_to_cpu(hdr.sze); + if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) { + dev_warn(ctrl->device, "FDP config size too large:%zu\n", + size); + return 0; + } + + h = kvmalloc(size, GFP_KERNEL); + if (!h) + return -ENOMEM; + + ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0, + NVME_CSI_NVM, h, size, 0, info->endgid); + if (ret) { + dev_warn(ctrl->device, + "FDP configs log status:0x%x endgid:%d\n", ret, + info->endgid); + goto out; + } + + n = le16_to_cpu(h->numfdpc) + 1; + if (fdp_idx > n) { + dev_warn(ctrl->device, "FDP index:%d out of range:%d\n", + fdp_idx, n); + /* Proceed without registering FDP streams */ + ret = 0; + goto out; + } + + log = h + 1; + desc = log; + end = log + size - sizeof(*h); + for (i = 0; i < fdp_idx; i++) { + log += le16_to_cpu(desc->dsze); + desc = log; + if (log >= end) { + dev_warn(ctrl->device, + "FDP invalid config descriptor list\n"); + ret = 0; + goto out; + } + } + + if (le32_to_cpu(desc->nrg) > 1) { + dev_warn(ctrl->device, "FDP NRG > 1 not supported\n"); + ret = 0; + goto out; + } + + info->runs = le64_to_cpu(desc->runs); +out: + kvfree(h); + return ret; +} + +static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info) +{ + struct nvme_ns_head *head = ns->head; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_fdp_ruh_status *ruhs; + struct nvme_fdp_config fdp; + struct nvme_command c = {}; + size_t size; + int i, ret; + + /* + * The FDP configuration is static for the lifetime of the namespace, + * so return immediately if we've already registered this namespace's + * streams. 
+ */ + if (head->nr_plids) + return 0; + + ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0, + &fdp); + if (ret) { + dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret); + return ret; + } + + if (!(fdp.flags & FDPCFG_FDPE)) + return 0; + + ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx); + if (!info->runs) + return ret; + + size = struct_size(ruhs, ruhsd, S8_MAX - 1); + ruhs = kzalloc(size, GFP_KERNEL); + if (!ruhs) + return -ENOMEM; + + c.imr.opcode = nvme_cmd_io_mgmt_recv; + c.imr.nsid = cpu_to_le32(head->ns_id); + c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS; + c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size)); + ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size); + if (ret) { + dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret); + goto free; + } + + head->nr_plids = le16_to_cpu(ruhs->nruhsd); + if (!head->nr_plids) + goto free; + + head->plids = kcalloc(head->nr_plids, sizeof(*head->plids), + GFP_KERNEL); + if (!head->plids) { + dev_warn(ctrl->device, + "failed to allocate %u FDP placement IDs\n", + head->nr_plids); + head->nr_plids = 0; + ret = -ENOMEM; + goto free; + } + + for (i = 0; i < head->nr_plids; i++) + head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid); +free: + kfree(ruhs); + return ret; +} + static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { @@ -2204,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) { + ret = nvme_query_fdp_info(ns, info); + if (ret < 0) + goto out; + } + lim = queue_limits_start_update(ns->disk->queue); memflags = blk_mq_freeze_queue(ns->disk->queue); @@ -2248,6 +2415,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_init_integrity(ns->head, &lim, info)) capacity = 0; + lim.max_write_streams = ns->head->nr_plids; + if (lim.max_write_streams) + lim.write_stream_granularity = min(info->runs, U32_MAX); + else + lim.write_stream_granularity = 0; + ret = queue_limits_commit_update(ns->disk->queue, &lim); if (ret) { blk_mq_unfreeze_queue(ns->disk->queue, memflags); @@ -2351,6 +2524,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) ns->head->disk->flags |= GENHD_FL_HIDDEN; else nvme_init_integrity(ns->head, &lim, info); + lim.max_write_streams = ns_lim->max_write_streams; + lim.write_stream_granularity = ns_lim->write_stream_granularity; ret = queue_limits_commit_update(ns->head->disk->queue, &lim); set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); @@ -3108,8 +3283,8 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, - void *log, size_t size, u64 offset) +static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, + u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi) { struct nvme_command c = { }; u32 dwlen = nvme_bytes_to_numd(size); @@ -3123,10 +3298,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); c.get_log_page.csi = csi; + c.get_log_page.lsi = cpu_to_le16(lsi); return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, + void *log, size_t size, u64 offset) +{ + return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size, + offset, 0); +} + static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, struct 
nvme_effects_log **log) { @@ -3584,7 +3767,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl, */ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h)) continue; - if (!list_empty(&h->list) && nvme_tryget_ns_head(h)) + if (nvme_tryget_ns_head(h)) return h; } @@ -3828,7 +4011,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) } } else { ret = -EINVAL; - if (!info->is_shared || !head->shared) { + if ((!info->is_shared || !head->shared) && + !list_empty(&head->list)) { dev_err(ctrl->device, "Duplicate unshared namespace %d\n", info->nsid); @@ -4032,7 +4216,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); if (list_empty(&ns->head->list)) { - list_del_init(&ns->head->entry); + if (!nvme_mpath_queue_if_no_path(ns->head)) + list_del_init(&ns->head->entry); last_path = true; } mutex_unlock(&ns->ctrl->subsys->lock); @@ -4053,7 +4238,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->ctrl->srcu); if (last_path) - nvme_mpath_shutdown_disk(ns->head); + nvme_mpath_remove_disk(ns->head); nvme_put_ns(ns); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2257c3c96dd2..fdafa3e9e66f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop) { - struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; struct nvme_fc_rport *rport = lsop->rport; struct nvme_fc_lport *lport = rport->lport; unsigned long flags; @@ -1434,6 +1433,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) } static void +nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; + + nvme_fc_xmt_ls_rsp_free(lsop); +} + +static void nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) { struct nvme_fc_rport *rport = lsop->rport; @@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) dev_warn(lport->dev, "LLDD rejected LS RSP xmt: LS %d status %d\n", w0->ls_cmd, ret); - nvme_fc_xmt_ls_rsp_done(lsop->lsrsp); + nvme_fc_xmt_ls_rsp_free(lsop); return; } } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index cf0ef4745564..878ea8b1a0ac 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -10,10 +10,61 @@ #include "nvme.h" bool multipath = true; -module_param(multipath, bool, 0444); +static bool multipath_always_on; + +static int multipath_param_set(const char *val, const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + if (multipath_always_on && !*arg) { + pr_err("Can't disable multipath when multipath_always_on is configured.\n"); + *arg = true; + return -EINVAL; + } + + return 0; +} + +static const struct kernel_param_ops multipath_param_ops = { + .set = multipath_param_set, + .get = param_get_bool, +}; + +module_param_cb(multipath, &multipath_param_ops, &multipath, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +static int multipath_always_on_set(const char *val, + const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret < 0) + return ret; + + if (*arg) + multipath = true; + + return 0; +} + +static const struct kernel_param_ops multipath_always_on_ops = { + .set = 
multipath_always_on_set, + .get = param_get_bool, +}; + +module_param_cb(multipath_always_on, &multipath_always_on_ops, + &multipath_always_on, 0444); +MODULE_PARM_DESC(multipath_always_on, + "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support"); + static const char *nvme_iopolicy_names[] = { [NVME_IOPOLICY_NUMA] = "numa", [NVME_IOPOLICY_RR] = "round-robin", @@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head) break; } } - return false; + + /* + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do + * not immediately fail I/O. Instead, requeue the I/O for the configured + * duration, anticipating that if there's a transient link failure then + * it may recover within this time window. This parameter is exported to + * userspace via sysfs, and its default value is zero. It is internally + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is + * non-zero, this flag is set to true. When zero, the flag is cleared. + */ + return nvme_mpath_queue_if_no_path(head); } static void nvme_ns_head_submit_bio(struct bio *bio) @@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work) } } +static void nvme_remove_head(struct nvme_ns_head *head) +{ + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + /* + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared + * to allow multipath to fail all I/O. + */ + kblockd_schedule_work(&head->requeue_work); + + nvme_cdev_del(&head->cdev, &head->cdev_device); + synchronize_srcu(&head->srcu); + del_gendisk(head->disk); + nvme_put_ns_head(head); + } +} + +static void nvme_remove_head_work(struct work_struct *work) +{ + struct nvme_ns_head *head = container_of(to_delayed_work(work), + struct nvme_ns_head, remove_work); + bool remove = false; + + mutex_lock(&head->subsys->lock); + if (list_empty(&head->list)) { + list_del_init(&head->entry); + remove = true; + } + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); + + module_put(THIS_MODULE); +} + int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; @@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work); + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work); + head->delayed_removal_secs = 0; /* - * Add a multipath node if the subsystems supports multiple controllers. - * We also do this for private namespaces as the namespace sharing flag - * could change after a rescan. + * If "multipath_always_on" is enabled, a multipath node is added + * regardless of whether the disk is single/multi ported, and whether + * the namespace is shared or private. If "multipath_always_on" is not + * enabled, a multipath node is added only if the subsystem supports + * multiple controllers and the "multipath" option is configured. In + * either case, for private namespaces, we ensure that the NSID is + * unique. 
*/ - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - !nvme_is_unique_nsid(ctrl, head) || !multipath) + if (!multipath_always_on) { + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + !multipath) + return 0; + } + + if (!nvme_is_unique_nsid(ctrl, head)) return 0; blk_set_stacking_limits(&lim); @@ -660,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state); sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); + nvme_tryget_ns_head(head); return 0; } @@ -1016,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr } DEVICE_ATTR_RO(numa_nodes); +static ssize_t delayed_removal_secs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + int ret; + + mutex_lock(&head->subsys->lock); + ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs); + mutex_unlock(&head->subsys->lock); + return ret; +} + +static ssize_t delayed_removal_secs_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + unsigned int sec; + int ret; + + ret = kstrtouint(buf, 0, &sec); + if (ret < 0) + return ret; + + mutex_lock(&head->subsys->lock); + head->delayed_removal_secs = sec; + if (sec) + set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + else + clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + mutex_unlock(&head->subsys->lock); + /* + * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen + * by its reader. + */ + synchronize_srcu(&head->srcu); + + return count; +} + +DEVICE_ATTR_RW(delayed_removal_secs); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { @@ -1137,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) #endif } -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +void nvme_mpath_remove_disk(struct nvme_ns_head *head) { - if (!head->disk) - return; - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - nvme_cdev_del(&head->cdev, &head->cdev_device); + bool remove = false; + + mutex_lock(&head->subsys->lock); + /* + * We are called when all paths have been removed, and at that point + * head->list is expected to be empty. However, nvme_remove_ns() and + * nvme_init_ns_head() can run concurrently and so if head->delayed_ + * removal_secs is configured, it is possible that by the time we reach + * this point, head->list may no longer be empty. Therefore, we recheck + * head->list here. If it is no longer empty then we skip enqueuing the + * delayed head removal work. + */ + if (!list_empty(&head->list)) + goto out; + + if (head->delayed_removal_secs) { /* - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared - * to allow multipath to fail all I/O. + * Ensure that no one could remove this module while the head + * remove work is pending. 
*/ - synchronize_srcu(&head->srcu); - kblockd_schedule_work(&head->requeue_work); - del_gendisk(head->disk); + if (!try_module_get(THIS_MODULE)) + goto out; + queue_delayed_work(nvme_wq, &head->remove_work, + head->delayed_removal_secs * HZ); + } else { + list_del_init(&head->entry); + remove = true; } +out: + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); } -void nvme_mpath_remove_disk(struct nvme_ns_head *head) +void nvme_mpath_put_disk(struct nvme_ns_head *head) { if (!head->disk) return; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8fc4683418a3..ad0c1f834f09 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -497,6 +497,9 @@ struct nvme_ns_head { struct device cdev_device; struct gendisk *disk; + + u16 nr_plids; + u16 *plids; #ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; spinlock_t requeue_lock; @@ -504,7 +507,10 @@ struct nvme_ns_head { struct work_struct partition_scan_work; struct mutex lock; unsigned long flags; -#define NVME_NSHEAD_DISK_LIVE 0 + struct delayed_work remove_work; + unsigned int delayed_removal_secs; +#define NVME_NSHEAD_DISK_LIVE 0 +#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1 struct nvme_ns __rcu *current_path[]; #endif }; @@ -897,10 +903,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int qid, nvme_submit_flags_t flags); int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result); + void *result); int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result); + void *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); @@ -961,7 +967,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns); void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); -void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_put_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_update(struct nvme_ctrl *ctrl); @@ -970,7 +976,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); void nvme_mpath_start_request(struct request *rq); void nvme_mpath_end_request(struct request *rq); @@ -987,12 +993,19 @@ extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute dev_attr_queue_depth; extern struct device_attribute dev_attr_numa_nodes; +extern struct device_attribute dev_attr_delayed_removal_secs; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return disk->fops == &nvme_ns_head_ops; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags)) + return true; + return false; +} #else #define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl 
*ctrl) @@ -1013,7 +1026,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { } -static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_put_disk(struct nvme_ns_head *head) { } static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns) @@ -1032,7 +1045,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } -static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } static inline void nvme_trace_bio_complete(struct request *req) @@ -1080,6 +1093,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return false; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + return false; +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16], diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f1dd804151b1..e0bfe04a2bc2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/nodemask.h> #include <linux/once.h> #include <linux/pci.h> #include <linux/suspend.h> @@ -34,16 +35,31 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* Optimisation for I/Os between 4k and 128k */ +#define NVME_SMALL_POOL_SIZE 256 /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. */ #define NVME_MAX_KB_SZ 8192 -#define NVME_MAX_SEGS 128 -#define NVME_MAX_META_SEGS 15 -#define NVME_MAX_NR_ALLOCATIONS 5 +#define NVME_MAX_NR_DESCRIPTORS 5 + +/* + * For data SGLs we support a single descriptors worth of SGL entries, but for + * now we also limit it to avoid an allocation larger than PAGE_SIZE for the + * scatterlist. + */ +#define NVME_MAX_SEGS \ + min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \ + (PAGE_SIZE / sizeof(struct scatterlist))) + +/* + * For metadata SGLs, only the small descriptor is supported, and the first + * entry is the segment descriptor, which for the data pointer sits in the SQE. + */ +#define NVME_MAX_META_SEGS \ + ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); +struct nvme_descriptor_pools { + struct dma_pool *large; + struct dma_pool *small; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -121,8 +142,6 @@ struct nvme_dev { struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; @@ -162,6 +181,7 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + struct nvme_descriptor_pools descriptor_pools[]; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) */ struct nvme_queue { struct nvme_dev *dev; + struct nvme_descriptor_pools descriptor_pools; spinlock_t sq_lock; void *sq_cmds; /* only used for poll queues: */ @@ -219,30 +240,30 @@ struct nvme_queue { struct completion delete_done; }; -union nvme_descriptor { - struct nvme_sgl_desc *sg_list; - __le64 *prp_list; +/* bits for iod->flags */ +enum nvme_iod_flags { + /* this command has been aborted by the timeout handler */ + IOD_ABORTED = 1U << 0, + + /* uses the small descriptor pool */ + IOD_SMALL_DESCRIPTOR = 1U << 1, }; /* * The nvme_iod describes the data in an I/O. - * - * The sg pointer contains the list of PRP/SGL chunk allocations in addition - * to the actual struct scatterlist. */ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool aborted; - s8 nr_allocations; /* PRP list pool allocations. 0 means small - pool in use */ + u8 flags; + u8 nr_descriptors; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; struct sg_table meta_sgt; - union nvme_descriptor meta_list; - union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; + struct nvme_sgl_desc *meta_descriptor; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -397,30 +418,78 @@ static __always_inline int nvme_pci_npages_prp(void) return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static struct nvme_descriptor_pools * +nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { - struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[0]; + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; + size_t small_align = NVME_SMALL_POOL_SIZE; - WARN_ON(hctx_idx != 0); - WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + if (pools->small) + return pools; /* already initialized */ - hctx->driver_data = nvmeq; - return 0; + pools->large = dma_pool_create_node("nvme descriptor page", dev->dev, + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node); + if (!pools->large) + return ERR_PTR(-ENOMEM); + + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + + pools->small = dma_pool_create_node("nvme descriptor small", dev->dev, + NVME_SMALL_POOL_SIZE, small_align, 0, numa_node); + if (!pools->small) { + dma_pool_destroy(pools->large); + pools->large = NULL; + return ERR_PTR(-ENOMEM); + } + + return pools; } -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static void nvme_release_descriptor_pools(struct nvme_dev *dev) +{ + unsigned i; + + for (i = 0; i < nr_node_ids; i++) { + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i]; + + dma_pool_destroy(pools->large); + dma_pool_destroy(pools->small); + } +} + +static int 
nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, + unsigned qid) { struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_descriptor_pools *pools; + struct blk_mq_tags *tags; + + tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; + WARN_ON(tags != hctx->tags); + pools = nvme_setup_descriptor_pools(dev, hctx->numa_node); + if (IS_ERR(pools)) + return PTR_ERR(pools); - WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + nvmeq->descriptor_pools = *pools; hctx->driver_data = nvmeq; return 0; } +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + WARN_ON(hctx_idx != 0); + return nvme_init_hctx_common(hctx, data, 0); +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + return nvme_init_hctx_common(hctx, data, hctx_idx + 1); +} + static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) @@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) +static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, + struct nvme_iod *iod) +{ + if (iod->flags & IOD_SMALL_DESCRIPTOR) + return nvmeq->descriptor_pools.small; + return nvmeq->descriptor_pools.large; +} + +static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) { const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; + if (iod->nr_descriptors == 1) { + dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0], + dma_addr); + return; + } + + for (i = 0; i < iod->nr_descriptors; i++) { + __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_pool_free(nvmeq->descriptor_pools.large, prp_list, + dma_addr); dma_addr = next_dma_addr; } } -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, + struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->sgt.nents); dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - - if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, - iod->first_dma); - else if (iod->nr_allocations == 1) - dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, - iod->first_dma); - else - nvme_free_prps(dev, req); + nvme_free_descriptors(nvmeq, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); } @@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; int length = blk_rq_payload_bytes(req); struct scatterlist *sg = iod->sgt.sgl; int dma_len = sg_dma_len(sg); @@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, 
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; dma_addr_t prp_dma; - int nprps, i; + int i; length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { @@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, goto done; } - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <= + NVME_SMALL_POOL_SIZE / sizeof(__le64)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->nr_allocations = -1; + prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &prp_dma); + if (!prp_list) return BLK_STS_RESOURCE; - } - iod->list[0].prp_list = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + + prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, + GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -673,7 +745,7 @@ done: cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); return BLK_STS_OK; free_prps: - nvme_free_prps(dev, req); + nvme_free_descriptors(nvmeq, req); return BLK_STS_RESOURCE; bad_sgl: WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), @@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; struct nvme_sgl_desc *sg_list; struct scatterlist *sg = iod->sgt.sgl; unsigned int entries = iod->sgt.nents; @@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_OK; } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) { - iod->nr_allocations = -1; + sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &sgl_dma); + if (!sg_list) return BLK_STS_RESOURCE; - } - - iod->list[0].sg_list = sg_list; + iod->descriptors[iod->nr_descriptors++] = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); @@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret = BLK_STS_RESOURCE; int rc; if (blk_rq_nr_phys_segments(req) == 1) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { @@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request 
*req, } if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw); else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw); if (ret != BLK_STS_OK) goto out_unmap_sg; return BLK_STS_OK; @@ -842,6 +906,7 @@ out_free_sg: static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_rw_command *cmnd = &iod->cmd.rw; struct nvme_sgl_desc *sg_list; @@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, if (rc) goto out_free_sg; - sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); + sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, + &sgl_dma); if (!sg_list) goto out_unmap_sg; entries = iod->meta_sgt.nents; - iod->meta_list.sg_list = sg_list; + iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; cmnd->flags = NVME_CMD_SGL_METASEG; @@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) { - if (nvme_pci_metadata_use_sgls(dev, req)) + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && + nvme_pci_metadata_use_sgls(dev, req)) return nvme_pci_setup_meta_sgls(dev, req); return nvme_pci_setup_meta_mptr(dev, req); } @@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = false; - iod->nr_allocations = -1; + iod->flags = 0; + iod->nr_descriptors = 0; iod->sgt.nents = 0; iod->meta_sgt.nents = 0; @@ -947,7 +1016,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; out_unmap_data: if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, req->mq_hctx->driver_data, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; @@ -1037,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist) } static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, + struct nvme_queue *nvmeq, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1048,8 +1118,8 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, return; } - dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, - iod->meta_dma); + dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, + iod->meta_dma); dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); } @@ -1060,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_dev *dev = nvmeq->dev; if (blk_integrity_rq(req)) - nvme_unmap_metadata(dev, req); + nvme_unmap_metadata(dev, nvmeq, req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, nvmeq, req); } static void nvme_pci_complete_rq(struct request *req) @@ -1490,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * returned to the driver, or if this is the admin queue. 
*/ opcode = nvme_req(req)->cmd->common.opcode; - if (!nvmeq->qid || iod->aborted) { + if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) { dev_warn(dev->ctrl.device, "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", req->tag, nvme_cid(req), opcode, @@ -1503,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = true; + iod->flags |= IOD_ABORTED; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); @@ -2842,35 +2912,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) return 0; } -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - size_t small_align = 256; - - dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) - small_align = 512; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, small_align, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); @@ -3185,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids), + GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); @@ -3260,13 +3302,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_uninit_ctrl; - result = nvme_setup_prp_pools(dev); - if (result) - goto out_dev_unmap; - result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto out_release_prp_pools; + goto out_dev_unmap; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); @@ -3342,8 +3380,6 @@ out_disable: out_release_iod_mempool: mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); -out_release_prp_pools: - nvme_release_prp_pools(dev); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3408,7 +3444,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); - nvme_release_prp_pools(dev); + nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); } @@ -3809,9 +3845,7 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); - BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); + BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index a5bc3bb483d5..29430949ce2f 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = { &dev_attr_ana_state.attr, 
&dev_attr_queue_depth.attr, &dev_attr_numa_nodes.attr, + &dev_attr_delayed_removal_secs.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, NULL, @@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } + if (a == &dev_attr_delayed_removal_secs.attr) { + struct gendisk *disk = dev_to_disk(dev); + + if (!nvme_disk_is_ns_head(disk)) + return 0; + } #endif return a->mode; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index aba365f97cf6..f6379aa33d77 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/crc32.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> @@ -16,7 +17,6 @@ #include <net/tls_prot.h> #include <net/handshake.h> #include <linux/blk-mq.h> -#include <crypto/hash.h> #include <net/busy_poll.h> #include <trace/events/sock.h> @@ -168,8 +168,8 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; bool tls_enabled; - struct ahash_request *rcv_hash; - struct ahash_request *snd_hash; + u32 rcv_crc; + u32 snd_crc; __le32 exp_ddgst; __le32 recv_ddgst; struct completion tls_complete; @@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync, bool last) + bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, * are on the same cpu, so we don't introduce contention. */ if (queue->io_cpu == raw_smp_processor_id() && - sync && empty && mutex_trylock(&queue->send_mutex)) { + empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_send_all(queue); mutex_unlock(&queue->send_mutex); } @@ -456,32 +456,38 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) return req; } -static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, - __le32 *dgst) +#define NVME_TCP_CRC_SEED (~0) + +static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, size_t off, size_t len) { - ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); - crypto_ahash_final(hash); + page += off / PAGE_SIZE; + off %= PAGE_SIZE; + while (len) { + const void *vaddr = kmap_local_page(page); + size_t n = min(len, (size_t)PAGE_SIZE - off); + + *crcp = crc32c(*crcp, vaddr + off, n); + kunmap_local(vaddr); + page++; + off = 0; + len -= n; + } } -static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, - struct page *page, off_t off, size_t len) +static inline __le32 nvme_tcp_ddgst_final(u32 crc) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, page, len, off); - ahash_request_set_crypt(hash, &sg, NULL, len); - crypto_ahash_update(hash); + return cpu_to_le32(~crc); } -static inline void nvme_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline __le32 nvme_tcp_hdgst(const void *pdu, size_t len) { - struct scatterlist sg; + return cpu_to_le32(~crc32c(NVME_TCP_CRC_SEED, pdu, len)); +} - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); +static inline void nvme_tcp_set_hdgst(void *pdu, size_t len) +{ + *(__le32 *)(pdu + len) = nvme_tcp_hdgst(pdu, len); } static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, @@ -499,8 +505,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, } recv_digest = *(__le32 
*)(pdu + hdr->hlen); - nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); - exp_digest = *(__le32 *)(pdu + hdr->hlen); + exp_digest = nvme_tcp_hdgst(pdu, pdu_len); if (recv_digest != exp_digest) { dev_err(queue->ctrl->ctrl.device, "header digest error: recv %#x expected %#x\n", @@ -526,7 +531,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) nvme_tcp_queue_id(queue)); return -EPROTO; } - crypto_ahash_init(queue->rcv_hash); + queue->rcv_crc = NVME_TCP_CRC_SEED; return 0; } @@ -770,7 +775,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->ttag = pdu->ttag; nvme_tcp_setup_h2c_data_pdu(req); - nvme_tcp_queue_request(req, false, true); + + llist_add(&req->lentry, &queue->req_list); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); return 0; } @@ -926,8 +933,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, iov_iter_count(&req->iter)); if (queue->data_digest) - ret = skb_copy_and_hash_datagram_iter(skb, *offset, - &req->iter, recv_len, queue->rcv_hash); + ret = skb_copy_and_crc32c_datagram_iter(skb, *offset, + &req->iter, recv_len, &queue->rcv_crc); else ret = skb_copy_datagram_iter(skb, *offset, &req->iter, recv_len); @@ -945,7 +952,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, if (!queue->data_remaining) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->exp_ddgst = nvme_tcp_ddgst_final(queue->rcv_crc); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { @@ -1147,7 +1154,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) return ret; if (queue->data_digest) - nvme_tcp_ddgst_update(queue->snd_hash, page, + nvme_tcp_ddgst_update(&queue->snd_crc, page, offset, ret); /* @@ -1161,8 +1168,8 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) /* fully successful last send in current PDU */ if (last && ret == len) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->snd_hash, - &req->ddgst); + req->ddgst = + nvme_tcp_ddgst_final(queue->snd_crc); req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { @@ -1194,7 +1201,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) msg.msg_flags |= MSG_EOR; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); bvec_set_virt(&bvec, (void *)pdu + req->offset, len); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); @@ -1207,7 +1214,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) if (inline_data) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; } else { nvme_tcp_done_send_req(queue); } @@ -1229,7 +1236,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) int ret; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); if (!req->h2cdata_left) msg.msg_flags |= MSG_SPLICE_PAGES; @@ -1244,7 +1251,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (!len) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; return 1; } req->offset += ret; @@ -1384,41 +1391,6 @@ static void nvme_tcp_io_work(struct work_struct *w) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } -static void 
nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); - - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); - - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; -} - static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) { struct nvme_tcp_request *async = &ctrl->async_req; @@ -1451,9 +1423,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); - page_frag_cache_drain(&queue->pf_cache); noreclaim_flag = memalloc_noreclaim_save(); @@ -1867,21 +1836,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, queue->hdr_digest = nctrl->opts->hdr_digest; queue->data_digest = nctrl->opts->data_digest; - if (queue->hdr_digest || queue->data_digest) { - ret = nvme_tcp_alloc_crypto(queue); - if (ret) { - dev_err(nctrl->device, - "failed to allocate queue %d crypto\n", qid); - goto err_sock; - } - } rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + nvme_tcp_hdgst_len(queue); queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); if (!queue->pdu) { ret = -ENOMEM; - goto err_crypto; + goto err_sock; } dev_dbg(nctrl->device, "connecting queue %d\n", @@ -1914,9 +1875,6 @@ err_init_connect: kernel_sock_shutdown(queue->sock, SHUT_RDWR); err_rcv_pdu: kfree(queue->pdu); -err_crypto: - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); err_sock: /* ->sock will be released by fput() */ fput(queue->sock->file); @@ -2385,7 +2343,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; - if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) { + if (ctrl->opts->concat && !ctrl->tls_pskid) { /* See comments for nvme_tcp_key_revoke_needed() */ dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); nvme_stop_keep_alive(ctrl); @@ -2637,7 +2595,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true, true); + nvme_tcp_queue_request(&ctrl->async_req, true); } static void nvme_tcp_complete_timed_out(struct request *rq) @@ -2789,7 +2747,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(rq); - nvme_tcp_queue_request(req, true, bd->last); + nvme_tcp_queue_request(req, bd->last); return BLK_STS_OK; } |
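The multipath.c and sysfs.c hunks above add a per-head delayed_removal_secs attribute: while it is non-zero, NVME_NSHEAD_QUEUE_IF_NO_PATH is set and I/O is requeued for that many seconds after the last path disappears, instead of the head disk being removed immediately. A minimal sketch of configuring it from user space follows, assuming the attribute shows up under the multipath node's block device in the usual sysfs layout; both the path format and the nvme1n1 device name are illustrative assumptions, not taken from the patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Write delayed_removal_secs for an NVMe multipath head node. The sysfs
 * location is assumed to follow the normal block-device attribute layout.
 */
static int set_delayed_removal(const char *disk, unsigned int secs)
{
	char path[256], val[16];
	int fd, len, ret = 0;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/delayed_removal_secs", disk);	/* assumed path */
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return -1;
	}

	len = snprintf(val, sizeof(val), "%u", secs);
	if (write(fd, val, len) != len) {
		perror("write");
		ret = -1;
	}
	close(fd);
	return ret;
}

int main(void)
{
	/* Keep the multipath node around for 60s after the last path drops. */
	return set_delayed_removal("nvme1n1", 60) ? 1 : 0;
}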