diff --git a/block/blk-mq.c b/block/blk-mq.c
index b1d81839679f..9692fa4c3ef2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4970,6 +4970,60 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	return ret;
 }
 
+/*
+ * Switch back to the elevator type stored in the xarray.
+ */
+static void blk_mq_elv_switch_back(struct request_queue *q,
+		struct xarray *elv_tbl)
+{
+	struct elevator_type *e = xa_load(elv_tbl, q->id);
+
+	/* The elv_update_nr_hw_queues unfreezes the queue. */
+	elv_update_nr_hw_queues(q, e);
+
+	/* Drop the reference acquired in blk_mq_elv_switch_none. */
+	if (e)
+		elevator_put(e);
+}
+
+/*
+ * Stores elevator type in xarray and set current elevator to none. It uses
+ * q->id as an index to store the elevator type into the xarray.
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+		struct xarray *elv_tbl)
+{
+	int ret = 0;
+
+	lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
+
+	/*
+	 * Accessing q->elevator without holding q->elevator_lock is safe here
+	 * because we're called from nr_hw_queue update which is protected by
+	 * set->update_nr_hwq_lock in the writer context. So, scheduler update/
+	 * switch code (which acquires the same lock in the reader context)
+	 * can't run concurrently.
+	 */
+	if (q->elevator) {
+
+		ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
+		if (WARN_ON_ONCE(ret))
+			return ret;
+
+		/*
+		 * Before we switch elevator to 'none', take a reference to
+		 * the elevator module so that while nr_hw_queue update is
+		 * running, no one can remove elevator module. We'd put the
+		 * reference to elevator module later when we switch back
+		 * elevator.
+		 */
+		__elevator_get(q->elevator->type);
+
+		elevator_set_none(q);
+	}
+	return ret;
+}
+
 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 						int nr_hw_queues)
 {
@@ -4977,6 +5031,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	int prev_nr_hw_queues = set->nr_hw_queues;
 	unsigned int memflags;
 	int i;
+	struct xarray elv_tbl;
 
 	lockdep_assert_held(&set->tag_list_lock);
 
@@ -4988,6 +5043,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		return;
 
 	memflags = memalloc_noio_save();
+
+	xa_init(&elv_tbl);
+
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_debugfs_unregister_hctxs(q);
 		blk_mq_sysfs_unregister_hctxs(q);
@@ -4996,11 +5054,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
 		blk_mq_freeze_queue_nomemsave(q);
 
-	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
-		list_for_each_entry(q, &set->tag_list, tag_set_list)
-			blk_mq_unfreeze_queue_nomemrestore(q);
-		goto reregister;
-	}
+	/*
+	 * Switch IO scheduler to 'none', cleaning up the data associated
+	 * with the previous scheduler. We will switch back once we are done
+	 * updating the new sw to hw queue mappings.
+	 */
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		if (blk_mq_elv_switch_none(q, &elv_tbl))
+			goto switch_back;
+
+	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+		goto switch_back;
 
 fallback:
 	blk_mq_update_queue_map(set);
@@ -5020,12 +5084,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		}
 		blk_mq_map_swqueue(q);
 	}
-
-	/* elv_update_nr_hw_queues() unfreeze queue for us */
+switch_back:
+	/* The blk_mq_elv_switch_back unfreezes queue for us. */
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
-		elv_update_nr_hw_queues(q);
+		blk_mq_elv_switch_back(q, &elv_tbl);
 
-reregister:
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_sysfs_register_hctxs(q);
 		blk_mq_debugfs_register_hctxs(q);
@@ -5033,6 +5096,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		blk_mq_remove_hw_queues_cpuhp(q);
 		blk_mq_add_hw_queues_cpuhp(q);
 	}
+
+	xa_destroy(&elv_tbl);
+
 	memalloc_noio_restore(memflags);
 
 	/* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/blk.h b/block/blk.h
index 468aa83c5a22..76901a39997f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -330,7 +330,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
diff --git a/block/elevator.c b/block/elevator.c
index ab22542e6cf0..9d81a06db6ec 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -689,21 +689,21 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
 {
 	struct elv_change_ctx ctx = {};
 	int ret = -ENODEV;
 
 	WARN_ON_ONCE(q->mq_freeze_depth == 0);
 
-	mutex_lock(&q->elevator_lock);
-	if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
-		ctx.name = q->elevator->type->elevator_name;
+	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
+		ctx.name = e->elevator_name;
 
+		mutex_lock(&q->elevator_lock);
 		/* force to reattach elevator after nr_hw_queue is updated */
 		ret = elevator_switch(q, &ctx);
+		mutex_unlock(&q->elevator_lock);
 	}
-	mutex_unlock(&q->elevator_lock);
 	blk_mq_unfreeze_queue_nomemrestore(q);
 	if (!ret)
 		WARN_ON_ONCE(elevator_change_done(q, &ctx));
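
For readers who want the flow at a glance: the hunks above (1) stash each queue's current elevator_type in an xarray keyed by q->id while pinning the elevator module, (2) switch every queue to 'none' before the tag set is reallocated and the sw-to-hw queue maps are rebuilt, and (3) switch each queue back afterwards, dropping the reference. Below is a minimal, hypothetical userspace sketch of that save / switch-to-none / restore pattern; it is not kernel code, and sched_type, queue and saved_sched[] are invented stand-ins for struct elevator_type, struct request_queue and the xarray.

/* Review aid only -- NOT kernel code. */
#include <stdio.h>

struct sched_type {
	const char *name;
	int refcount;
};

struct queue {
	int id;
	struct sched_type *sched;	/* NULL plays the role of "none" */
};

#define MAX_QUEUES 8
static struct sched_type *saved_sched[MAX_QUEUES];	/* stand-in for the xarray */

/* Analogue of blk_mq_elv_switch_none(): remember the current scheduler,
 * pin it (cf. __elevator_get()), then detach it (cf. elevator_set_none()). */
static void sched_switch_none(struct queue *q)
{
	if (!q->sched)
		return;
	saved_sched[q->id] = q->sched;
	q->sched->refcount++;
	q->sched = NULL;
}

/* Analogue of blk_mq_elv_switch_back(): look up the saved type by queue id
 * (cf. xa_load()), reattach it, and drop the pinning reference
 * (cf. elevator_put()). */
static void sched_switch_back(struct queue *q)
{
	struct sched_type *e = saved_sched[q->id];

	q->sched = e;
	if (e) {
		e->refcount--;
		saved_sched[q->id] = NULL;
	}
}

int main(void)
{
	struct sched_type mq_deadline = { "mq-deadline", 1 };
	struct queue q0 = { 0, &mq_deadline }, q1 = { 1, NULL };
	struct queue *qs[] = { &q0, &q1 };
	int i;

	for (i = 0; i < 2; i++)		/* before remapping hw queues */
		sched_switch_none(qs[i]);

	printf("during update: q0=%s q1=%s\n",
	       q0.sched ? q0.sched->name : "none",
	       q1.sched ? q1.sched->name : "none");

	/* ... the nr_hw_queues update / queue remapping would run here ... */

	for (i = 0; i < 2; i++)		/* restore what each queue had */
		sched_switch_back(qs[i]);

	printf("after update:  q0=%s q1=%s (refcount back to %d)\n",
	       q0.sched ? q0.sched->name : "none",
	       q1.sched ? q1.sched->name : "none",
	       mq_deadline.refcount);
	return 0;
}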