[PATCH 7/8] blk-mq: create hctx for each present CPU

Wed Jun 7 02:10:46 PDT 2017

On Sat, Jun 03, 2017 at 04:04:02PM +0200, Christoph Hellwig wrote:
> Currently we only create hctx for online CPUs, which can lead to a lot
> of churn due to frequent soft offline / online operations.  Instead
> allocate one for each present CPU to avoid this and dramatically simplify
> the code.
> 
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  block/blk-mq.c             | 120 +++++----------------------------------------
>  block/blk-mq.h             |   5 --
>  include/linux/cpuhotplug.h |   1 -
>  3 files changed, 11 insertions(+), 115 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 1bcccedcc74f..66ca9a090984 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -37,9 +37,6 @@
>  #include "blk-wbt.h"
>  #include "blk-mq-sched.h"
>  
> -static DEFINE_MUTEX(all_q_mutex);
> -static LIST_HEAD(all_q_list);
> -
>  static void blk_mq_poll_stats_start(struct request_queue *q);
>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>  static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
> @@ -1966,8 +1963,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>  		INIT_LIST_HEAD(&__ctx->rq_list);
>  		__ctx->queue = q;
>  
> -		/* If the cpu isn't online, the cpu is mapped to first hctx */
> -		if (!cpu_online(i))
> +		/* If the cpu isn't present, the cpu is mapped to first hctx */
> +		if (!cpu_present(i))
>  			continue;
>  
>  		hctx = blk_mq_map_queue(q, i);
> @@ -2010,8 +2007,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
>  	}
>  }
>  
> -static void blk_mq_map_swqueue(struct request_queue *q,
> -			       const struct cpumask *online_mask)
> +static void blk_mq_map_swqueue(struct request_queue *q)
>  {
>  	unsigned int i, hctx_idx;
>  	struct blk_mq_hw_ctx *hctx;
> @@ -2029,13 +2025,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>  	}
>  
>  	/*
> -	 * Map software to hardware queues
> +	 * Map software to hardware queues.
> +	 *
> +	 * If the cpu isn't present, the cpu is mapped to first hctx.
>  	 */
> -	for_each_possible_cpu(i) {
> -		/* If the cpu isn't online, the cpu is mapped to first hctx */
> -		if (!cpumask_test_cpu(i, online_mask))
> -			continue;
> -
> +	for_each_present_cpu(i) {
>  		hctx_idx = q->mq_map[i];
>  		/* unmapped hw queue can be remapped after CPU topo changed */
>  		if (!set->tags[hctx_idx] &&
> @@ -2321,16 +2315,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  		blk_queue_softirq_done(q, set->ops->complete);
>  
>  	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
> -
> -	get_online_cpus();
> -	mutex_lock(&all_q_mutex);
> -
> -	list_add_tail(&q->all_q_node, &all_q_list);
>  	blk_mq_add_queue_tag_set(set, q);
> -	blk_mq_map_swqueue(q, cpu_online_mask);
> -
> -	mutex_unlock(&all_q_mutex);
> -	put_online_cpus();
> +	blk_mq_map_swqueue(q);
>  
>  	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
>  		int ret;
> @@ -2356,18 +2342,12 @@ void blk_mq_free_queue(struct request_queue *q)
>  {
>  	struct blk_mq_tag_set	*set = q->tag_set;
>  
> -	mutex_lock(&all_q_mutex);
> -	list_del_init(&q->all_q_node);
> -	mutex_unlock(&all_q_mutex);
> -
>  	blk_mq_del_queue_tag_set(q);
> -
>  	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
>  }
>  
>  /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q,
> -				const struct cpumask *online_mask)
> +static void blk_mq_queue_reinit(struct request_queue *q)
>  {
>  	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>  
> @@ -2380,76 +2360,12 @@ static void blk_mq_queue_reinit(struct request_queue *q,
>  	 * involves free and re-allocate memory, worthy doing?)
>  	 */
>  
> -	blk_mq_map_swqueue(q, online_mask);
> +	blk_mq_map_swqueue(q);
>  
>  	blk_mq_sysfs_register(q);
>  	blk_mq_debugfs_register_hctxs(q);
>  }
>  
> -/*
> - * New online cpumask which is going to be set in this hotplug event.
> - * Declare this cpumasks as global as cpu-hotplug operation is invoked
> - * one-by-one and dynamically allocating this could result in a failure.
> - */
> -static struct cpumask cpuhp_online_new;
> -
> -static void blk_mq_queue_reinit_work(void)
> -{
> -	struct request_queue *q;
> -
> -	mutex_lock(&all_q_mutex);
> -	/*
> -	 * We need to freeze and reinit all existing queues.  Freezing
> -	 * involves synchronous wait for an RCU grace period and doing it
> -	 * one by one may take a long time.  Start freezing all queues in
> -	 * one swoop and then wait for the completions so that freezing can
> -	 * take place in parallel.
> -	 */
> -	list_for_each_entry(q, &all_q_list, all_q_node)
> -		blk_freeze_queue_start(q);
> -	list_for_each_entry(q, &all_q_list, all_q_node)
> -		blk_mq_freeze_queue_wait(q);
> -
> -	list_for_each_entry(q, &all_q_list, all_q_node)
> -		blk_mq_queue_reinit(q, &cpuhp_online_new);
> -
> -	list_for_each_entry(q, &all_q_list, all_q_node)
> -		blk_mq_unfreeze_queue(q);
> -
> -	mutex_unlock(&all_q_mutex);
> -}
> -
> -static int blk_mq_queue_reinit_dead(unsigned int cpu)
> -{
> -	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
> -	blk_mq_queue_reinit_work();
> -	return 0;
> -}
> -
> -/*
> - * Before hotadded cpu starts handling requests, new mappings must be
> - * established.  Otherwise, these requests in hw queue might never be
> - * dispatched.
> - *
> - * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
> - * for CPU0, and ctx1 for CPU1).
> - *
> - * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
> - * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
> - *
> - * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
> - * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
> - * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
> - * ignored.
> - */
> -static int blk_mq_queue_reinit_prepare(unsigned int cpu)
> -{
> -	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
> -	cpumask_set_cpu(cpu, &cpuhp_online_new);
> -	blk_mq_queue_reinit_work();
> -	return 0;
> -}
> -
>  static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
>  {
>  	int i;
> @@ -2660,7 +2576,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	blk_mq_update_queue_map(set);
>  	list_for_each_entry(q, &set->tag_list, tag_set_list) {
>  		blk_mq_realloc_hw_ctxs(set, q);
> -		blk_mq_queue_reinit(q, cpu_online_mask);
> +		blk_mq_queue_reinit(q);
>  	}
>  
>  	list_for_each_entry(q, &set->tag_list, tag_set_list)
> @@ -2876,24 +2792,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_poll);
>  
> -void blk_mq_disable_hotplug(void)
> -{
> -	mutex_lock(&all_q_mutex);
> -}
> -
> -void blk_mq_enable_hotplug(void)
> -{
> -	mutex_unlock(&all_q_mutex);
> -}
> -
>  static int __init blk_mq_init(void)
>  {
>  	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
>  				blk_mq_hctx_notify_dead);
> -
> -	cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
> -				  blk_mq_queue_reinit_prepare,
> -				  blk_mq_queue_reinit_dead);

Hi Christoph,

One thing not sure is that we may need to handle new CPU hotplug
after initialization. Without the CPU hotplug handler, system may
not scale well when more CPUs are added to sockets.

Another thing is that I don't see how NVMe handles this situation,
blk_mq_update_nr_hw_queues() is called in nvme_reset_work(), so
that means RESET need to be triggered after new CPUs are added to
system? I have tried to add new CPUs runtime on Qemu, and not see
new hw queues are added no matter this patchset is applied or not.

Thanks,
Ming