nvme-multipath: Add visibility for round-robin io-policy

This patch adds nvme native multipath visibility for the round-robin
io-policy. It creates a "multipath" sysfs directory under the head gendisk
device node directory and then, from the "multipath" directory, adds a link
to each namespace path device that the head node refers to.

For instance, if we have a shared namespace accessible from two different
controllers/paths then we create a soft link to each path device from head
disk node as shown below:

$ ls -l /sys/block/nvme1n1/multipath/
nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1
nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1

In the above example, nvme1n1 is head gendisk node created for a shared
namespace and the namespace is accessible from nvme1c1n1 and nvme1c3n1
paths.

For round-robin I/O policy, we could easily infer from the above output
that I/O workload targeted to nvme1n1 would toggle across paths nvme1c1n1
and nvme1c3n1.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
This commit is contained in:
Nilay Shroff 2025-01-12 18:11:44 +05:30 committed by Keith Busch
parent 316dabe608
commit 4dbd2b2ebe
4 changed files with 130 additions and 4 deletions

View File

@ -4020,6 +4020,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (!nvme_ns_head_multipath(ns->head))
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
nvme_mpath_remove_sysfs_link(ns);
del_gendisk(ns->disk);
mutex_lock(&ns->ctrl->namespaces_lock);

View File

@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
kblockd_schedule_work(&head->partition_scan_work);
}
nvme_mpath_add_sysfs_link(ns->head);
mutex_lock(&head->lock);
if (nvme_path_is_optimized(ns)) {
int node, srcu_idx;
@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
if (nvme_state_is_live(ns->ana_state) &&
nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
nvme_mpath_set_live(ns);
else {
/*
* Add sysfs link from multipath head gendisk node to path
* device gendisk node.
* If path's ana state is live (i.e. state is either optimized
* or non-optimized) while we alloc the ns then sysfs link would
* be created from nvme_mpath_set_live(). In that case we would
* not fallthrough this code path. However for the path's ana
* state other than live, we call nvme_mpath_set_live() only
* after ana state transitioned to the live state. But we still
* want to create the sysfs link from head node to a path device
* irrespective of the path's ana state.
* If we reach through here then it means that path's ana state
* is not live but still create the sysfs link to this path from
* head node if head node of the path has already come alive.
*/
if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
nvme_mpath_add_sysfs_link(ns->head);
}
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@ -967,6 +988,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
return -ENXIO; /* just break out of the loop */
}
/*
 * nvme_mpath_add_sysfs_link - link each path device from the head gendisk
 * @head: multipath head namespace
 *
 * For every namespace path chained on @head->list, create a sysfs link
 * inside the head gendisk's "multipath" group pointing at the path's
 * gendisk device node. Safe to call repeatedly; already-linked paths are
 * skipped via the NVME_NS_SYSFS_ATTR_LINK flag.
 */
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
	struct device *target;
	int rc, srcu_idx;
	struct nvme_ns *ns;
	struct kobject *kobj;

	/*
	 * Ensure head disk node is already added otherwise we may get invalid
	 * kobj for head disk node
	 */
	if (!test_bit(GD_ADDED, &head->disk->state))
		return;

	kobj = &disk_to_dev(head->disk)->kobj;

	/*
	 * loop through each ns chained through the head->list and create the
	 * sysfs link from head node to the ns path node
	 */
	srcu_idx = srcu_read_lock(&head->srcu);

	/*
	 * The list is traversed under srcu_read_lock(), not rcu_read_lock(),
	 * so use the SRCU-aware iterator to keep RCU lockdep happy.
	 */
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		/*
		 * Avoid creating link if it already exists for the given path.
		 * When path ana state transitions from optimized to non-
		 * optimized or vice-versa, the nvme_mpath_set_live() is
		 * invoked which in turn calls this function. Now if the sysfs
		 * link already exists for the given path and we attempt to re-
		 * create the link then sysfs code would warn about it loudly.
		 * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
		 * that we're not creating duplicate link.
		 * The test_and_set_bit() is used because it is protecting
		 * against multiple nvme paths being simultaneously added.
		 */
		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
			continue;

		/*
		 * Ensure that ns path disk node is already added otherwise we
		 * may get invalid kobj name for target
		 */
		if (!test_bit(GD_ADDED, &ns->disk->state))
			continue;

		target = disk_to_dev(ns->disk);
		/*
		 * Create sysfs link from head gendisk kobject @kobj to the
		 * ns path gendisk kobject @target->kobj.
		 */
		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
				&target->kobj, dev_name(target));
		if (unlikely(rc)) {
			dev_err(disk_to_dev(ns->head->disk),
				"failed to create link to %s\n",
				dev_name(target));
			/* Allow a retry on a later state change. */
			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
		}
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}
/*
 * nvme_mpath_remove_sysfs_link - undo nvme_mpath_add_sysfs_link() for @ns
 * @ns: namespace path being removed
 *
 * Drop the sysfs link that points from the multipath head gendisk's
 * "multipath" directory to this path's gendisk device node, if one was
 * ever created.
 */
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
	struct device *path_dev;

	/* No link was created for this path; nothing to tear down. */
	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
		return;

	path_dev = disk_to_dev(ns->disk);
	sysfs_remove_link_from_group(&disk_to_dev(ns->head->disk)->kobj,
			nvme_ns_mpath_attr_group.name, dev_name(path_dev));
	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
}
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {

View File

@ -534,10 +534,11 @@ struct nvme_ns {
struct nvme_ns_head *head;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
#define NVME_NS_REMOVING 0
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
#define NVME_NS_SYSFS_ATTR_LINK 5
struct cdev cdev;
struct device cdev_device;
@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_attr_groups[];
extern const struct attribute_group nvme_ns_mpath_attr_group;
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
@ -1009,6 +1013,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
/*
 * !CONFIG_NVME_MULTIPATH stubs. The add-link stub must take the same
 * parameter type as the real declaration (struct nvme_ns_head *), so both
 * config variants present an identical interface to callers.
 */
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
}
static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
return false;

View File

@ -299,8 +299,22 @@ static const struct attribute_group nvme_ns_attr_group = {
.is_visible = nvme_ns_attrs_are_visible,
};
#ifdef CONFIG_NVME_MULTIPATH
/*
 * The "multipath" sysfs group under the head gendisk node. It carries no
 * attributes of its own; it exists as the directory into which
 * nvme_mpath_add_sysfs_link() adds a symlink per namespace path device.
 */
static struct attribute *nvme_ns_mpath_attrs[] = {
	NULL,
};

const struct attribute_group nvme_ns_mpath_attr_group = {
	.name = "multipath",
	.attrs = nvme_ns_mpath_attrs,
};
#endif
/* Attribute groups registered on every nvme namespace gendisk. */
const struct attribute_group *nvme_ns_attr_groups[] = {
	&nvme_ns_attr_group,
#ifdef CONFIG_NVME_MULTIPATH
	/* Creates the (empty) "multipath" directory that path links live in. */
	&nvme_ns_mpath_attr_group,
#endif
	NULL,
};