From 32acab3181c7053c775ca128c3a5c6ce50197d7f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 2 Nov 2017 12:59:30 +0100
Subject: nvme: implement multipath access to nvme subsystems

This patch adds native multipath support to the nvme driver. For each
namespace we create only a single block device node, which can be used
to access that namespace through any of the controllers that refer to
it. The gendisk for each controller's path to the namespace still
exists inside the kernel, but is hidden from userspace. The character
device nodes are still available on a per-controller basis. A new link
from the sysfs directory for the subsystem makes it possible to find
all controllers for a given subsystem.

Currently we will always send I/O to the first available path; this
will be changed once the NVMe Asymmetric Namespace Access (ANA) TP is
ratified and implemented, at which point we will look at the ANA state
for each namespace. Another possibility that was prototyped is to use
the path that is closest to the submitting NUMA node, which will be
mostly interesting for PCI, but might also be useful for RDMA or FC
transports in the future. There is no plan to implement round-robin or
I/O service time path selectors, as those do not scale to the
performance rates provided by NVMe.

The multipath device will go away once all paths to it disappear; any
delay to keep it alive needs to be implemented at the controller level.

Signed-off-by: Christoph Hellwig
Reviewed-by: Keith Busch
Reviewed-by: Martin K. Petersen
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/multipath.c | 255 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 drivers/nvme/host/multipath.c
(limited to 'drivers/nvme/host/multipath.c')

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
new file mode 100644
index 000000000000..062754ebebfd
--- /dev/null
+++ b/drivers/nvme/host/multipath.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/moduleparam.h>
+#include "nvme.h"
+
+static bool multipath = true;
+module_param(multipath, bool, 0644);
+MODULE_PARM_DESC(multipath,
+	"turn on native support for multiple controllers per subsystem");
+
+void nvme_failover_req(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ns->head->requeue_lock, flags);
+	blk_steal_bios(&ns->head->requeue_list, req);
+	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+	blk_mq_end_request(req, 0);
+
+	nvme_reset_ctrl(ns->ctrl);
+	kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+bool nvme_req_needs_failover(struct request *req)
+{
+	if (!(req->cmd_flags & REQ_NVME_MPATH))
+		return false;
+
+	switch (nvme_req(req)->status & 0x7ff) {
+	/*
+	 * Generic command status:
+	 */
+	case NVME_SC_INVALID_OPCODE:
+	case NVME_SC_INVALID_FIELD:
+	case NVME_SC_INVALID_NS:
+	case NVME_SC_LBA_RANGE:
+	case NVME_SC_CAP_EXCEEDED:
+	case NVME_SC_RESERVATION_CONFLICT:
+		return false;
+
+	/*
+	 * I/O command set specific error.  Unfortunately these values are
+	 * reused for fabrics commands, but those should never get here.
+	 */
+	case NVME_SC_BAD_ATTRIBUTES:
+	case NVME_SC_INVALID_PI:
+	case NVME_SC_READ_ONLY:
+	case NVME_SC_ONCS_NOT_SUPPORTED:
+		WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
+			nvme_fabrics_command);
+		return false;
+
+	/*
+	 * Media and Data Integrity Errors:
+	 */
+	case NVME_SC_WRITE_FAULT:
+	case NVME_SC_READ_ERROR:
+	case NVME_SC_GUARD_CHECK:
+	case NVME_SC_APPTAG_CHECK:
+	case NVME_SC_REFTAG_CHECK:
+	case NVME_SC_COMPARE_FAILED:
+	case NVME_SC_ACCESS_DENIED:
+	case NVME_SC_UNWRITTEN_BLOCK:
+		return false;
+	}
+
+	/* Everything else could be a path failure, so should be retried */
+	return true;
+}
+
+void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (ns->head->disk)
+			kblockd_schedule_work(&ns->head->requeue_work);
+	}
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (ns->ctrl->state == NVME_CTRL_LIVE) {
+			rcu_assign_pointer(head->current_path, ns);
+			return ns;
+		}
+	}
+
+	return NULL;
+}
+
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+
+	if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+		ns = __nvme_find_path(head);
+	return ns;
+}
+
+static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
+		struct bio *bio)
+{
+	struct nvme_ns_head *head = q->queuedata;
+	struct device *dev = disk_to_dev(head->disk);
+	struct nvme_ns *ns;
+	blk_qc_t ret = BLK_QC_T_NONE;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&head->srcu);
+	ns = nvme_find_path(head);
+	if (likely(ns)) {
+		bio->bi_disk = ns->disk;
+		bio->bi_opf |= REQ_NVME_MPATH;
+		ret = direct_make_request(bio);
+	} else if (!list_empty_careful(&head->list)) {
+		dev_warn_ratelimited(dev, "no path available - requeueing I/O\n");
+
+		spin_lock_irq(&head->requeue_lock);
+		bio_list_add(&head->requeue_list, bio);
+		spin_unlock_irq(&head->requeue_lock);
+	} else {
+		dev_warn_ratelimited(dev, "no path - failing I/O\n");
+
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+	}
+
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return ret;
+}
+
+static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+{
+	struct nvme_ns_head *head = q->queuedata;
+	struct nvme_ns *ns;
+	bool found = false;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&head->srcu);
+	ns = srcu_dereference(head->current_path, &head->srcu);
+	if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+		found = ns->queue->poll_fn(q, qc);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return found;
+}
+
+static void nvme_requeue_work(struct work_struct *work)
+{
+	struct nvme_ns_head *head =
+		container_of(work, struct nvme_ns_head, requeue_work);
+	struct bio *bio, *next;
+
+	spin_lock_irq(&head->requeue_lock);
+	next = bio_list_get(&head->requeue_list);
+	spin_unlock_irq(&head->requeue_lock);
+
+	while ((bio = next) != NULL) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+
+		/*
+		 * Reset disk to the mpath node and resubmit to select a new
+		 * path.
+		 */
+		bio->bi_disk = head->disk;
+		generic_make_request(bio);
+	}
+}
+
+int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
+{
+	struct request_queue *q;
+	bool vwc = false;
+
+	bio_list_init(&head->requeue_list);
+	spin_lock_init(&head->requeue_lock);
+	INIT_WORK(&head->requeue_work, nvme_requeue_work);
+
+	/*
+	 * Add a multipath node if the subsystem supports multiple controllers.
+	 * We also do this for private namespaces as the namespace sharing data
+	 * could change after a rescan.
+	 */
+	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
+		return 0;
+
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
+	if (!q)
+		goto out;
+	q->queuedata = head;
+	blk_queue_make_request(q, nvme_ns_head_make_request);
+	q->poll_fn = nvme_ns_head_poll;
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	/* set to a default value of 512 until the disk is validated */
+	blk_queue_logical_block_size(q, 512);
+
+	/* we need to propagate up the VWC settings */
+	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+		vwc = true;
+	blk_queue_write_cache(q, vwc, vwc);
+
+	head->disk = alloc_disk(0);
+	if (!head->disk)
+		goto out_cleanup_queue;
+	head->disk->fops = &nvme_ns_head_ops;
+	head->disk->private_data = head;
+	head->disk->queue = q;
+	head->disk->flags = GENHD_FL_EXT_DEVT;
+	sprintf(head->disk->disk_name, "nvme%dn%d",
+			ctrl->subsys->instance, head->instance);
+	return 0;
+
+out_cleanup_queue:
+	blk_cleanup_queue(q);
+out:
+	return -ENOMEM;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns_head *head)
+{
+	if (!head->disk)
+		return;
+	device_add_disk(&head->subsys->dev, head->disk);
+}
+
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+	if (!head->disk)
+		return;
+	del_gendisk(head->disk);
+	blk_set_queue_dying(head->disk->queue);
+	/* make sure all pending bios are cleaned up */
+	kblockd_schedule_work(&head->requeue_work);
+	flush_work(&head->requeue_work);
+	blk_cleanup_queue(head->disk->queue);
+	put_disk(head->disk);
+}

-- cgit v1.2.3
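The path-selection policy in the patch above is intentionally simple: reuse
the cached current_path if its controller is still live, otherwise scan the
sibling list for the first live controller and cache that. The stand-alone C
sketch below mimics only that policy outside the kernel; every name in it
(path, ns_head, find_path, ...) is invented for the illustration and is not
part of the driver, and it deliberately leaves out the SRCU protection the
real nvme_find_path() relies on for lock-free readers.

/*
 * Illustrative user-space analogue of the "first live path" selection
 * used by nvme_find_path()/__nvme_find_path(); not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

enum path_state { PATH_LIVE, PATH_DEAD };

struct path {
	const char *name;
	enum path_state state;
};

struct ns_head {
	struct path *paths;
	size_t nr_paths;
	struct path *current_path;	/* cached last-known-good path */
};

/* Scan all paths and cache the first live one (cf. __nvme_find_path()). */
static struct path *scan_for_path(struct ns_head *head)
{
	for (size_t i = 0; i < head->nr_paths; i++) {
		if (head->paths[i].state == PATH_LIVE) {
			head->current_path = &head->paths[i];
			return head->current_path;
		}
	}
	return NULL;
}

/* Fast path: reuse the cached path unless it went away (cf. nvme_find_path()). */
static struct path *find_path(struct ns_head *head)
{
	struct path *p = head->current_path;

	if (!p || p->state != PATH_LIVE)
		p = scan_for_path(head);
	return p;
}

int main(void)
{
	struct path paths[] = {
		{ "ctrl0", PATH_DEAD },	/* e.g. a controller being reset */
		{ "ctrl1", PATH_LIVE },
	};
	struct ns_head head = { paths, 2, NULL };
	struct path *p = find_path(&head);

	printf("I/O would be routed via %s\n", p ? p->name : "<no path>");
	return 0;
}

If no live path exists at all, the real driver either requeues the bio (when
paths still exist but are down) or fails it, which is the branch structure
visible in nvme_ns_head_make_request() above.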
From 5b85b826b8925b3f1910b6613eb82c4df240a5b5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 9 Nov 2017 13:51:03 +0100
Subject: nvme: also expose the namespace identification sysfs files for mpath nodes

We do this by adding a helper that returns the ns_head for a device that
can belong to either the per-controller or per-subsystem block device
nodes, and otherwise reuse all the existing code.

Signed-off-by: Christoph Hellwig
Reviewed-by: Keith Busch
Reviewed-by: Sagi Grimberg
Reviewed-by: Martin K. Petersen
Reviewed-by: Hannes Reinecke
Reviewed-by: Johannes Thumshirn
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/multipath.c | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'drivers/nvme/host/multipath.c')

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 062754ebebfd..850275896e49 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -239,12 +239,18 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head)
 	if (!head->disk)
 		return;
 	device_add_disk(&head->subsys->dev, head->disk);
+	if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
+			&nvme_ns_id_attr_group))
+		pr_warn("%s: failed to create sysfs group for identification\n",
+			head->disk->disk_name);
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
+	sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+			   &nvme_ns_id_attr_group);
 	del_gendisk(head->disk);
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */

-- cgit v1.2.3
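Once the identification attribute group is attached to the multipath node,
userspace can read those attributes like any other sysfs file. The minimal
sketch below does exactly that; the device name and attribute ("nvme0n1",
"wwid") are example assumptions for illustration, and the attributes actually
present depend on the kernel version and the namespace.

/*
 * Illustrative only: read a namespace identification attribute from the
 * multipath node's sysfs directory. Paths are examples, not guarantees.
 */
#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/block/nvme0n1/wwid";	/* example path */
	char buf[256];
	FILE *f = fopen(attr, "r");

	if (!f) {
		perror(attr);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", attr, buf);	/* sysfs values end in '\n' */
	fclose(f);
	return 0;
}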
From e9a48034d7d1318ece7d4a235838a86c94db9d68 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Thu, 9 Nov 2017 17:57:06 +0100
Subject: nvme: create 'slaves' and 'holders' entries for hidden controllers

When creating nvme multipath devices we should populate the 'slaves' and
'holders' directories properly to aid userspace topology detection.

Signed-off-by: Hannes Reinecke
[hch: split from a larger patch, compile fix for CONFIG_NVME_MULTIPATH=n]
Reviewed-by: Keith Busch
Reviewed-by: Martin K. Petersen
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
(limited to 'drivers/nvme/host/multipath.c')
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 850275896e49..78d92151a904 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -245,6 +245,25 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head)
 		head->disk->disk_name);
 }
 
+void nvme_mpath_add_disk_links(struct nvme_ns *ns)
+{
+	struct kobject *slave_disk_kobj, *holder_disk_kobj;
+
+	if (!ns->head->disk)
+		return;
+
+	slave_disk_kobj = &disk_to_dev(ns->disk)->kobj;
+	if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj,
+			kobject_name(slave_disk_kobj)))
+		return;
+
+	holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj;
+	if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj,
+			kobject_name(holder_disk_kobj)))
+		sysfs_remove_link(ns->head->disk->slave_dir,
+			kobject_name(slave_disk_kobj));
+}
+
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
@@ -259,3 +278,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 	blk_cleanup_queue(head->disk->queue);
 	put_disk(head->disk);
 }
+
+void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
+{
+	if (!ns->head->disk)
+		return;
+
+	sysfs_remove_link(ns->disk->part0.holder_dir,
+			  kobject_name(&disk_to_dev(ns->head->disk)->kobj));
+	sysfs_remove_link(ns->head->disk->slave_dir,
+			  kobject_name(&disk_to_dev(ns->disk)->kobj));
+}

-- cgit v1.2.3
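With the 'slaves' and 'holders' links in place, userspace topology detection
reduces to reading those directories. The sketch below lists the slaves of an
example multipath node; the node name ("nvme0n1") is an assumption made for
the illustration, and error handling is kept minimal.

/*
 * Illustrative only: enumerate the 'slaves' directory that the patch above
 * populates with the hidden per-controller paths of a multipath node.
 */
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *dir = "/sys/block/nvme0n1/slaves";	/* example path */
	struct dirent *de;
	DIR *d = opendir(dir);

	if (!d) {
		perror(dir);
		return 1;
	}
	while ((de = readdir(d)) != NULL) {
		if (strcmp(de->d_name, ".") && strcmp(de->d_name, ".."))
			printf("path: %s\n", de->d_name);	/* hidden per-controller gendisks */
	}
	closedir(d);
	return 0;
}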