summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/blk-map.c91
-rw-r--r--block/blk-merge.c8
-rw-r--r--drivers/nvme/host/core.c111
-rw-r--r--drivers/nvme/host/nvme.h8
-rw-r--r--drivers/nvme/host/pci.c149
-rw-r--r--fs/fs-writeback.c54
-rw-r--r--fs/super.c1
-rw-r--r--include/linux/bio.h37
-rw-r--r--include/linux/blkdev.h25
-rw-r--r--include/linux/writeback.h5
10 files changed, 341 insertions, 148 deletions
diff --git a/block/blk-map.c b/block/blk-map.c
index f565e11f465a..a54f0543b956 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio)
return ret;
}
+static int __blk_rq_map_user_iov(struct request *rq,
+ struct rq_map_data *map_data, struct iov_iter *iter,
+ gfp_t gfp_mask, bool copy)
+{
+ struct request_queue *q = rq->q;
+ struct bio *bio, *orig_bio;
+ int ret;
+
+ if (copy)
+ bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+ else
+ bio = bio_map_user_iov(q, iter, gfp_mask);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ if (map_data && map_data->null_mapped)
+ bio_set_flag(bio, BIO_NULL_MAPPED);
+
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ if (map_data)
+ map_data->offset += bio->bi_iter.bi_size;
+
+ orig_bio = bio;
+ blk_queue_bounce(q, &bio);
+
+ /*
+ * We link the bounce buffer in and could have to traverse it
+ * later so we have to get a ref to prevent it from being freed
+ */
+ bio_get(bio);
+
+ ret = blk_rq_append_bio(q, rq, bio);
+ if (ret) {
+ bio_endio(bio);
+ __blk_rq_unmap_user(orig_bio);
+ bio_put(bio);
+ return ret;
+ }
+
+ return 0;
+}
+
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
@@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- struct bio *bio;
- int unaligned = 0;
- struct iov_iter i;
struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+ bool copy = (q->dma_pad_mask & iter->count) || map_data;
+ struct bio *bio = NULL;
+ struct iov_iter i;
+ int ret;
if (!iter || !iter->count)
return -EINVAL;
@@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
*/
if ((uaddr & queue_dma_alignment(q)) ||
iovec_gap_to_prv(q, &prv, &iov))
- unaligned = 1;
+ copy = true;
prv.iov_base = iov.iov_base;
prv.iov_len = iov.iov_len;
}
- if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
- bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
- else
- bio = bio_map_user_iov(q, iter, gfp_mask);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
-
- if (bio->bi_iter.bi_size != iter->count) {
- /*
- * Grab an extra reference to this bio, as bio_unmap_user()
- * expects to be able to drop it twice as it happens on the
- * normal IO completion path
- */
- bio_get(bio);
- bio_endio(bio);
- __blk_rq_unmap_user(bio);
- return -EINVAL;
- }
+ i = *iter;
+ do {
+ ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+ if (ret)
+ goto unmap_rq;
+ if (!bio)
+ bio = rq->bio;
+ } while (iov_iter_count(&i));
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
-
- blk_queue_bounce(q, &bio);
- bio_get(bio);
- blk_rq_bio_prep(q, rq, bio);
return 0;
+
+unmap_rq:
+ __blk_rq_unmap_user(bio);
+ rq->bio = NULL;
+ return -EINVAL;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 888a7fec81f7..261353166dcf 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -304,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
struct bio *nxt)
{
struct bio_vec end_bv = { NULL }, nxt_bv;
- struct bvec_iter iter;
if (!blk_queue_cluster(q))
return 0;
@@ -316,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
if (!bio_has_data(bio))
return 1;
- bio_for_each_segment(end_bv, bio, iter)
- if (end_bv.bv_len == iter.bi_size)
- break;
-
- nxt_bv = bio_iovec(nxt);
+ bio_get_last_bvec(bio, &end_bv);
+ bio_get_first_bvec(nxt, &nxt_bv);
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
return 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3cd921e6121e..03c46412fff4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref)
ns->disk->private_data = NULL;
spin_unlock(&dev_list_lock);
- nvme_put_ctrl(ns->ctrl);
put_disk(ns->disk);
+ ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+ nvme_put_ctrl(ns->ctrl);
kfree(ns);
}
@@ -183,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
goto out_unmap;
}
- if (meta_buffer) {
+ if (meta_buffer && meta_len) {
struct bio_integrity_payload *bip;
meta = kmalloc(meta_len, GFP_KERNEL);
@@ -373,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
if (copy_from_user(&io, uio, sizeof(io)))
return -EFAULT;
+ if (io.flags)
+ return -EINVAL;
switch (io.opcode) {
case nvme_cmd_write:
@@ -424,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
+ if (cmd.flags)
+ return -EINVAL;
memset(&c, 0, sizeof(c));
c.common.opcode = cmd.opcode;
@@ -556,6 +561,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
u16 old_ms;
unsigned short bs;
+ if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+ set_capacity(disk, 0);
+ return -ENODEV;
+ }
if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
__func__, ns->ctrl->instance, ns->ns_id);
@@ -831,6 +840,23 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
return ret;
}
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+ struct request_queue *q)
+{
+ if (ctrl->max_hw_sectors) {
+ u32 max_segments =
+ (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+
+ blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+ blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+ }
+ if (ctrl->stripe_size)
+ blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
+ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ blk_queue_virt_boundary(q, ctrl->page_size - 1);
+}
+
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
@@ -888,6 +914,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
}
+ nvme_set_queue_limits(ctrl, ctrl->admin_q);
+
kfree(id);
return 0;
}
@@ -1118,9 +1146,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (!ns)
return;
+ ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
+ if (ns->instance < 0)
+ goto out_free_ns;
+
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
- goto out_free_ns;
+ goto out_release_instance;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
@@ -1134,17 +1166,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns->disk = disk;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- if (ctrl->max_hw_sectors) {
- blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
- blk_queue_max_segments(ns->queue,
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
- }
- if (ctrl->stripe_size)
- blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
- blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
- blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+ nvme_set_queue_limits(ctrl, ns->queue);
disk->major = nvme_major;
disk->first_minor = 0;
@@ -1153,7 +1177,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
disk->queue = ns->queue;
disk->driverfs_dev = ctrl->device;
disk->flags = GENHD_FL_EXT_DEVT;
- sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+ sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
if (nvme_revalidate_disk(ns->disk))
goto out_free_disk;
@@ -1173,40 +1197,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
kfree(disk);
out_free_queue:
blk_cleanup_queue(ns->queue);
+ out_release_instance:
+ ida_simple_remove(&ctrl->ns_ida, ns->instance);
out_free_ns:
kfree(ns);
}
static void nvme_ns_remove(struct nvme_ns *ns)
{
- bool kill = nvme_io_incapable(ns->ctrl) &&
- !blk_queue_dying(ns->queue);
-
- lockdep_assert_held(&ns->ctrl->namespaces_mutex);
-
- if (kill) {
- blk_set_queue_dying(ns->queue);
+ if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+ return;
- /*
- * The controller was shutdown first if we got here through
- * device removal. The shutdown may requeue outstanding
- * requests. These need to be aborted immediately so
- * del_gendisk doesn't block indefinitely for their completion.
- */
- blk_mq_abort_requeue_list(ns->queue);
- }
if (ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_attr_group);
del_gendisk(ns->disk);
- }
- if (kill || !blk_queue_dying(ns->queue)) {
blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
}
+ mutex_lock(&ns->ctrl->namespaces_mutex);
list_del_init(&ns->list);
+ mutex_unlock(&ns->ctrl->namespaces_mutex);
nvme_put_ns(ns);
}
@@ -1300,10 +1313,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns, *next;
- mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
nvme_ns_remove(ns);
- mutex_unlock(&ctrl->namespaces_mutex);
}
static DEFINE_IDA(nvme_instance_ida);
@@ -1350,6 +1361,7 @@ static void nvme_free_ctrl(struct kref *kref)
put_device(ctrl->device);
nvme_release_instance(ctrl);
+ ida_destroy(&ctrl->ns_ida);
ctrl->ops->free_ctrl(ctrl);
}
@@ -1390,6 +1402,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
}
get_device(ctrl->device);
dev_set_drvdata(ctrl->device, ctrl);
+ ida_init(&ctrl->ns_ida);
spin_lock(&dev_list_lock);
list_add_tail(&ctrl->node, &nvme_ctrl_list);
@@ -1402,6 +1415,38 @@ out:
return ret;
}
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (!kref_get_unless_zero(&ns->kref))
+ continue;
+
+ /*
+ * Revalidating a dead namespace sets capacity to 0. This will
+ * end buffered writers dirtying pages that can't be synced.
+ */
+ if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ revalidate_disk(ns->disk);
+
+ blk_set_queue_dying(ns->queue);
+ blk_mq_abort_requeue_list(ns->queue);
+ blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+ nvme_put_ns(ns);
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9664d07d807d..fb15ba5f5d19 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -72,6 +72,7 @@ struct nvme_ctrl {
struct mutex namespaces_mutex;
struct device *device; /* char device */
struct list_head node;
+ struct ida ns_ida;
char name[12];
char serial[20];
@@ -102,6 +103,7 @@ struct nvme_ns {
struct request_queue *queue;
struct gendisk *disk;
struct kref kref;
+ int instance;
u8 eui[8];
u8 uuid[16];
@@ -112,6 +114,11 @@ struct nvme_ns {
bool ext;
u8 pi_type;
int type;
+ unsigned long flags;
+
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD 1
+
u64 mode_select_num_blocks;
u32 mode_select_block_len;
};
@@ -240,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a128672472ec..680f5780750c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,7 +86,6 @@ struct nvme_queue;
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
/*
@@ -120,6 +119,7 @@ struct nvme_dev {
unsigned long flags;
#define NVME_CTRL_RESETTING 0
+#define NVME_CTRL_REMOVING 1
struct nvme_ctrl ctrl;
struct completion ioq_wait;
@@ -286,6 +286,17 @@ static int nvme_init_request(void *data, struct request *req,
return 0;
}
+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+ /*
+ * Do not queue new scan work when a controller is reset during
+ * removal.
+ */
+ if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+ return;
+ queue_work(nvme_workq, &dev->scan_work);
+}
+
static void nvme_complete_async_event(struct nvme_dev *dev,
struct nvme_completion *cqe)
{
@@ -300,7 +311,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
switch (result & 0xff07) {
case NVME_AER_NOTICE_NS_CHANGED:
dev_info(dev->dev, "rescanning\n");
- queue_work(nvme_workq, &dev->scan_work);
+ nvme_queue_scan(dev);
default:
dev_warn(dev->dev, "async event result %08x\n", result);
}
@@ -679,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
spin_lock_irq(&nvmeq->q_lock);
if (unlikely(nvmeq->cq_vector < 0)) {
- ret = BLK_MQ_RQ_QUEUE_BUSY;
+ if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+ ret = BLK_MQ_RQ_QUEUE_BUSY;
+ else
+ ret = BLK_MQ_RQ_QUEUE_ERROR;
spin_unlock_irq(&nvmeq->q_lock);
goto out;
}
@@ -1250,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = {
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+ /*
+ * If the controller was reset during removal, it's possible
+ * user requests may be waiting on a stopped queue. Start the
+ * queue to flush these to completion.
+ */
+ blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
blk_cleanup_queue(dev->ctrl.admin_q);
blk_mq_free_tag_set(&dev->admin_tagset);
}
@@ -1690,14 +1710,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
return 0;
dev->ctrl.tagset = &dev->tagset;
}
- queue_work(nvme_workq, &dev->scan_work);
+ nvme_queue_scan(dev);
return 0;
}
-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
{
u64 cap;
- int bars, result = -ENOMEM;
+ int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_enable_device_mem(pdev))
@@ -1705,24 +1725,14 @@ static int nvme_dev_map(struct nvme_dev *dev)
dev->entry[0].vector = pdev->irq;
pci_set_master(pdev);
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
- if (!bars)
- goto disable_pci;
-
- if (pci_request_selected_regions(pdev, bars, "nvme"))
- goto disable_pci;
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
- dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
- if (!dev->bar)
- goto disable;
-
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
- goto unmap;
+ goto disable;
}
/*
@@ -1732,7 +1742,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
if (!pdev->irq) {
result = pci_enable_msix(pdev, dev->entry, 1);
if (result < 0)
- goto unmap;
+ goto disable;
}
cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1759,18 +1769,20 @@ static int nvme_dev_map(struct nvme_dev *dev)
pci_save_state(pdev);
return 0;
- unmap:
- iounmap(dev->bar);
- dev->bar = NULL;
disable:
- pci_release_regions(pdev);
- disable_pci:
pci_disable_device(pdev);
return result;
}
static void nvme_dev_unmap(struct nvme_dev *dev)
{
+ if (dev->bar)
+ iounmap(dev->bar);
+ pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
+{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pdev->msi_enabled)
@@ -1778,12 +1790,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
else if (pdev->msix_enabled)
pci_disable_msix(pdev);
- if (dev->bar) {
- iounmap(dev->bar);
- dev->bar = NULL;
- pci_release_regions(pdev);
- }
-
if (pci_is_enabled(pdev)) {
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
@@ -1842,7 +1848,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_dev_list_remove(dev);
mutex_lock(&dev->shutdown_lock);
- if (dev->bar) {
+ if (pci_is_enabled(to_pci_dev(dev->dev))) {
nvme_stop_queues(&dev->ctrl);
csts = readl(dev->bar + NVME_REG_CSTS);
}
@@ -1855,7 +1861,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
}
- nvme_dev_unmap(dev);
+ nvme_pci_disable(dev);
for (i = dev->queue_count - 1; i >= 0; i--)
nvme_clear_queue(dev->queues[i]);
@@ -1899,10 +1905,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
kfree(dev);
}
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+ dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
+
+ kref_get(&dev->ctrl.kref);
+ nvme_dev_disable(dev, false);
+ if (!schedule_work(&dev->remove_work))
+ nvme_put_ctrl(&dev->ctrl);
+}
+
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
- int result;
+ int result = -ENODEV;
if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
goto out;
@@ -1911,37 +1927,37 @@ static void nvme_reset_work(struct work_struct *work)
* If we're called to reset a live controller first shut it down before
* moving on.
*/
- if (dev->bar)
+ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
set_bit(NVME_CTRL_RESETTING, &dev->flags);
- result = nvme_dev_map(dev);
+ result = nvme_pci_enable(dev);
if (result)
goto out;
result = nvme_configure_admin_queue(dev);
if (result)
- goto unmap;
+ goto out;
nvme_init_queue(dev->queues[0], 0);
result = nvme_alloc_admin_tags(dev);
if (result)
- goto disable;
+ goto out;
result = nvme_init_identify(&dev->ctrl);
if (result)
- goto free_tags;
+ goto out;
result = nvme_setup_io_queues(dev);
if (result)
- goto free_tags;
+ goto out;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
result = nvme_dev_list_add(dev);
if (result)
- goto remove;
+ goto out;
/*
* Keep the controller around but remove all namespaces if we don't have
@@ -1958,19 +1974,8 @@ static void nvme_reset_work(struct work_struct *work)
clear_bit(NVME_CTRL_RESETTING, &dev->flags);
return;
- remove:
- nvme_dev_list_remove(dev);
- free_tags:
- nvme_dev_remove_admin(dev);
- blk_put_queue(dev->ctrl.admin_q);
- dev->ctrl.admin_q = NULL;
- dev->queues[0]->tags = NULL;
- disable:
- nvme_disable_admin_queue(dev, false);
- unmap:
- nvme_dev_unmap(dev);
out:
- nvme_remove_dead_ctrl(dev);
+ nvme_remove_dead_ctrl(dev, result);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1978,19 +1983,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ nvme_kill_queues(&dev->ctrl);
if (pci_get_drvdata(pdev))
pci_stop_and_remove_bus_device_locked(pdev);
nvme_put_ctrl(&dev->ctrl);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
- dev_warn(dev->dev, "Removing after probe failure\n");
- kref_get(&dev->ctrl.kref);
- if (!schedule_work(&dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static int nvme_reset(struct nvme_dev *dev)
{
if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2042,6 +2040,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.free_ctrl = nvme_pci_free_ctrl,
};
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+ int bars;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ bars = pci_select_bars(pdev, IORESOURCE_MEM);
+ if (!bars)
+ return -ENODEV;
+ if (pci_request_selected_regions(pdev, bars, "nvme"))
+ return -ENODEV;
+
+ dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+ if (!dev->bar)
+ goto release;
+
+ return 0;
+ release:
+ pci_release_regions(pdev);
+ return -ENODEV;
+}
+
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -2066,6 +2085,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
+ result = nvme_dev_map(dev);
+ if (result)
+ goto free;
+
INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
@@ -2089,6 +2112,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_release_prp_pools(dev);
put_pci:
put_device(dev->dev);
+ nvme_dev_unmap(dev);
free:
kfree(dev->queues);
kfree(dev->entry);
@@ -2112,10 +2136,16 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_dev_disable(dev, true);
}
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
static void nvme_remove(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
+ set_bit(NVME_CTRL_REMOVING, &dev->flags);
pci_set_drvdata(pdev, NULL);
flush_work(&dev->scan_work);
nvme_remove_namespaces(&dev->ctrl);
@@ -2126,6 +2156,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_queues(dev, 0);
nvme_release_cmb(dev);
nvme_release_prp_pools(dev);
+ nvme_dev_unmap(dev);
nvme_put_ctrl(&dev->ctrl);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f76d8950a57..5c46ed9f3e14 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
- struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
wb_put(new_wb);
iput(inode);
- deactivate_super(sb);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
-
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb)
- goto out_unlock;
-
- if (!atomic_inc_not_zero(&inode->i_sb->s_active))
- goto out_unlock;
-
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
inode->i_state |= I_WB_SWITCH;
spin_unlock(&inode->i_lock);
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
return;
-out_unlock:
- spin_unlock(&inode->i_lock);
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
diff --git a/fs/super.c b/fs/super.c
index 1182af8fd5ff..74914b1bae70 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5349e6816cbb..cb6888824108 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
bio->bi_flags &= ~(1U << bit);
}
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ *bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ int idx;
+
+ if (!bio_flagged(bio, BIO_CLONED)) {
+ *bv = bio->bi_io_vec[bio->bi_vcnt - 1];
+ return;
+ }
+
+ if (unlikely(!bio_multiple_segments(bio))) {
+ *bv = bio_iovec(bio);
+ return;
+ }
+
+ bio_advance_iter(bio, &iter, iter.bi_size);
+
+ if (!iter.bi_bvec_done)
+ idx = iter.bi_idx - 1;
+ else /* in the middle of bvec */
+ idx = iter.bi_idx;
+
+ *bv = bio->bi_io_vec[idx];
+
+ /*
+ * iter.bi_bvec_done records actual length of the last bvec
+ * if this bio ends in the middle of one io vector
+ */
+ if (iter.bi_bvec_done)
+ bv->bv_len = iter.bi_bvec_done;
+}
+
enum bip_flags {
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4571ef1a12a9..413c84fbc4ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
{
struct request_queue *q = rq->q;
- if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+ if (unlikely(rq->cmd_type != REQ_TYPE_FS))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
@@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p)
page_cache_release(p.v);
}
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+ struct bio_vec *bprv, unsigned int offset)
+{
+ return offset ||
+ ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
@@ -1381,18 +1388,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
{
if (!queue_virt_boundary(q))
return false;
- return offset ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return __bvec_gap_to_prev(q, bprv, offset);
}
static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
struct bio *next)
{
- if (!bio_has_data(prev))
- return false;
+ if (bio_has_data(prev) && queue_virt_boundary(q)) {
+ struct bio_vec pb, nb;
+
+ bio_get_last_bvec(prev, &pb);
+ bio_get_first_bvec(next, &nb);
- return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
- next->bi_io_vec[0].bv_offset);
+ return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+ }
+
+ return false;
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b333c945e571..d0b5ca5d4e08 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_io(struct writeback_control *wbc, struct page *page,
size_t bytes);
+void cgroup_writeback_umount(void);
/**
* inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc,
{
}
+static inline void cgroup_writeback_umount(void)
+{
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*