Newer
Older
cpumask_clear(hctx->cpumask);
hctx->dispatch_from = NULL;
* Map software to hardware queues.
*
* If the cpu isn't present, the cpu is mapped to first hctx.
for_each_possible_cpu(i) {
ctx = per_cpu_ptr(q->queue_ctx, i);
if (!set->map[j].nr_queues) {
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
hctx_idx = set->map[j].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
set->map[j].mq_map[i] = 0;
}
ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
* devices share queues across queue maps.
*/
if (cpumask_test_cpu(i, hctx->cpumask))
continue;
cpumask_set_cpu(i, hctx->cpumask);
hctx->type = j;
ctx->index_hw[hctx->type] = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
/*
* If the nr_ctx type overflows, we have exceeded the
* amount of sw queues we can support.
*/
BUG_ON(!hctx->nr_ctx);
}
for (; j < HCTX_MAX_TYPES; j++)
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
queue_for_each_hw_ctx(q, hctx, i) {
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
*/
if (!hctx->nr_ctx) {
/* Never unmap queue 0. We need it as a
* fallback in case of a new remap fails
* allocation
*/
if (i)
__blk_mq_free_map_and_rqs(set, i);
hctx->tags = NULL;
continue;
}
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
* over all possibly mapped software queues.
*/
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
/*
* Caller needs to ensure that we're either frozen/quiesced, or that
* the queue isn't live yet.
*/
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (shared) {
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
} else {
blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared)
{
struct request_queue *q;
lockdep_assert_held(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
blk_mq_unfreeze_queue(q);
}
}
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
blk_mq_update_tag_set_shared(set, false);
mutex_unlock(&set->tag_list_lock);
INIT_LIST_HEAD(&q->tag_set_list);
}
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
struct request_queue *q)
{
mutex_lock(&set->tag_list_lock);
/*
* Check to see if we're transitioning to shared (from 1 to 2 queues).
*/
if (!list_empty(&set->tag_list) &&
!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
blk_mq_update_tag_set_shared(set, true);
if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true);
list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
/* All allocations will be freed in release handler of q->mq_kobj */
static int blk_mq_alloc_ctxs(struct request_queue *q)
{
struct blk_mq_ctxs *ctxs;
int cpu;
ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
if (!ctxs)
return -ENOMEM;
ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!ctxs->queue_ctx)
goto fail;
for_each_possible_cpu(cpu) {
struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
ctx->ctxs = ctxs;
}
q->mq_kobj = &ctxs->kobj;
q->queue_ctx = ctxs->queue_ctx;
return 0;
fail:
kfree(ctxs);
return -ENOMEM;
}
/*
* It is the actual release handler for mq, but we do it from
* request queue's release handler for avoiding use-after-free
* and headache because q->mq_kobj shouldn't have been introduced,
* but we can't group ctx/kctx kobj without it.
*/
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
int i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
/* all hctx are in .unused_hctx_list now */
list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
list_del_init(&hctx->hctx_list);
kobject_put(&hctx->kobj);
kfree(q->queue_hw_ctx);
/*
* release .mq_kobj and sw queue's kobject now because
* both share lifetime with request queue.
*/
blk_mq_sysfs_deinit(q);
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
{
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
blk_cleanup_queue(q);
return ERR_PTR(ret);
}
return q;
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass)
{
struct request_queue *q;
q = blk_mq_init_queue_data(set, queuedata);
if (IS_ERR(q))
return ERR_CAST(q);
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_cleanup_queue(q);
return ERR_PTR(-ENOMEM);
EXPORT_SYMBOL(__blk_mq_alloc_disk);
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
{
struct blk_mq_hw_ctx *hctx = NULL, *tmp;
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
if (tmp->numa_node == node) {
hctx = tmp;
break;
}
}
if (hctx)
list_del_init(&hctx->hctx_list);
spin_unlock(&q->unused_hctx_lock);
if (!hctx)
hctx = blk_mq_alloc_hctx(q, set, node);
if (!hctx)
if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
goto free_hctx;
return hctx;
free_hctx:
kobject_put(&hctx->kobj);
fail:
return NULL;
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
int i, j, end;
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
if (q->nr_hw_queues < set->nr_hw_queues) {
struct blk_mq_hw_ctx **new_hctxs;
new_hctxs = kcalloc_node(set->nr_hw_queues,
sizeof(*new_hctxs), GFP_KERNEL,
set->numa_node);
if (!new_hctxs)
return;
if (hctxs)
memcpy(new_hctxs, hctxs, q->nr_hw_queues *
sizeof(*hctxs));
q->queue_hw_ctx = new_hctxs;
kfree(hctxs);
hctxs = new_hctxs;
}
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_hw_ctx *hctx;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
* to use the previous one.
*/
if (hctxs[i] && (hctxs[i]->numa_node == node))
continue;
hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
if (hctx) {
blk_mq_exit_hctx(q, set, hctxs[i], i);
hctxs[i] = hctx;
} else {
if (hctxs[i])
pr_warn("Allocate new hctx on node %d fails,\
fallback to previous one on node %d\n",
node, hctxs[i]->numa_node);
else
break;
/*
* Increasing nr_hw_queues fails. Free the newly allocated
* hctxs and keep the previous q->nr_hw_queues.
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
end = i;
} else {
j = i;
end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
for (; j < end; j++) {
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
__blk_mq_free_map_and_rqs(set, j);
blk_mq_exit_hctx(q, set, hctx, j);
hctxs[j] = NULL;
}
}
mutex_unlock(&q->sysfs_lock);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
/* mark the queue as mq asap */
q->mq_ops = set->ops;
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
goto err_exit;
if (blk_mq_alloc_ctxs(q))
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (set->nr_maps > HCTX_TYPE_POLL &&
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth;
/*
* Default to classic polling
*/
Yufen Yu
committed
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
return 0;
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
q->poll_cb = NULL;
return -ENOMEM;
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
/* tags can _not_ be used after returning from blk_mq_exit_queue */
void blk_mq_exit_queue(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
/* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
/* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
blk_mq_del_queue_tag_set(q);
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
if (blk_mq_is_shared_tags(set->flags)) {
set->shared_tags = blk_mq_alloc_map_and_rqs(set,
BLK_MQ_NO_HCTX_IDX,
set->queue_depth);
if (!set->shared_tags)
return -ENOMEM;
}
for (i = 0; i < set->nr_hw_queues; i++) {
if (!__blk_mq_alloc_map_and_rqs(set, i))
goto out_unwind;
cond_resched();
}
return 0;
out_unwind:
while (--i >= 0)
__blk_mq_free_map_and_rqs(set, i);
if (blk_mq_is_shared_tags(set->flags)) {
blk_mq_free_map_and_rqs(set, set->shared_tags,
return -ENOMEM;
}
/*
* Allocate the request maps associated with this tag_set. Note that this
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
{
unsigned int depth;
int err;
depth = set->queue_depth;
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break;
set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
if (!set->queue_depth || err) {
pr_err("blk-mq: failed to allocate request map\n");
return -ENOMEM;
}
if (depth != set->queue_depth)
pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
depth, set->queue_depth);
return 0;
}
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
/*
* blk_mq_map_queues() and multiple .map_queues() implementations
* expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
* number of hardware queues.
*/
if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues && !is_kdump_kernel()) {
/*
* transport .map_queues is usually done in the following
* way:
*
* for (queue = 0; queue < set->nr_hw_queues; queue++) {
* mask = get_cpu_mask(queue)
* for_each_cpu(cpu, mask)
* }
*
* When we need to remap, the table has to be cleared for
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]);
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
int cur_nr_hw_queues, int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
if (cur_nr_hw_queues >= new_nr_hw_queues)
return 0;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
if (!new_tags)
return -ENOMEM;
if (set->tags)
memcpy(new_tags, set->tags, cur_nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
set->nr_hw_queues = new_nr_hw_queues;
return 0;
}
static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
int new_nr_hw_queues)
{
return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
}
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
* requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
if (!set->nr_hw_queues)
return -EINVAL;
return -EINVAL;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
return -EINVAL;
return -EINVAL;
if (!set->ops->get_budget ^ !set->ops->put_budget)
return -EINVAL;
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
set->queue_depth = BLK_MQ_MAX_DEPTH;
}
if (!set->nr_maps)
set->nr_maps = 1;
else if (set->nr_maps > HCTX_MAX_TYPES)
return -EINVAL;
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
* There is no use for more h/w queues than cpus if we just have
* a single map
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
return -ENOMEM;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
ret = blk_mq_update_queue_map(set);
if (ret)
goto out_free_mq_map;
ret = blk_mq_alloc_set_map_and_rqs(set);
goto out_free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map);
set->map[i].mq_map = NULL;
}
kfree(set->tags);
set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
/* allocate and initialize a tagset for a simple single-queue device */
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int queue_depth,
unsigned int set_flags)
{
memset(set, 0, sizeof(*set));
set->ops = ops;
set->nr_hw_queues = 1;
set->nr_maps = 1;
set->queue_depth = queue_depth;
set->numa_node = NUMA_NO_NODE;
set->flags = set_flags;
return blk_mq_alloc_tag_set(set);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
for (i = 0; i < set->nr_hw_queues; i++)
__blk_mq_free_map_and_rqs(set, i);
if (blk_mq_is_shared_tags(set->flags)) {
blk_mq_free_map_and_rqs(set, set->shared_tags,
BLK_MQ_NO_HCTX_IDX);
}
for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map);
set->map[j].mq_map = NULL;
}
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
int i, ret;
if (q->nr_requests == nr)
return 0;
blk_mq_quiesce_queue(q);
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
/*
* If we're using an MQ scheduler, just update the scheduler
* queue depth. This is similar to what the old code would do.
*/
if (hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
if (blk_mq_is_shared_tags(set->flags)) {
if (q->elevator)
blk_mq_tag_update_sched_shared_tags(q);
else
blk_mq_tag_resize_shared_tags(set, nr);
blk_mq_unquiesce_queue(q);
return ret;
}
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
/*
* request_queue and elevator_type pair.
* It is just used by __blk_mq_update_nr_hw_queues to cache
* the elevator_type associated with a request_queue.
*/
struct blk_mq_qe_pair {
struct list_head node;
struct request_queue *q;
struct elevator_type *type;
};
/*
* Cache the elevator_type in qe pair list and switch the
* io scheduler to 'none'
*/
static bool blk_mq_elv_switch_none(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
if (!q->elevator)
return true;
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!qe)
return false;
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
list_add(&qe->node, head);
mutex_lock(&q->sysfs_lock);
/*
* After elevator_switch_mq, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
* module get by elevator_get will also be put. So we need to get
* a reference of the io scheduler module here to prevent it to be
* removed.
*/
__module_get(qe->type->elevator_owner);
elevator_switch_mq(q, NULL);
mutex_unlock(&q->sysfs_lock);
return true;
}
static void blk_mq_elv_switch_back(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
struct elevator_type *t = NULL;
list_for_each_entry(qe, head, node)
if (qe->q == q) {
t = qe->type;
break;
}
if (!t)
return;
list_del(&qe->node);
kfree(qe);
mutex_lock(&q->sysfs_lock);
elevator_switch_mq(q, t);
mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
LIST_HEAD(head);
int prev_nr_hw_queues;
lockdep_assert_held(&set->tag_list_lock);
if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
if (nr_hw_queues < 1)
return;
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
if (!blk_mq_elv_switch_none(&head, q))
goto switch_back;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister(q);
}
prev_nr_hw_queues = set->nr_hw_queues;
if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
0)
goto reregister;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
}
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register(q);
blk_mq_debugfs_register_hctxs(q);
switch_back:
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
/* Enable polling stats and return whether they were already enabled. */
static bool blk_poll_stats_enable(struct request_queue *q)
{
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
return true;
blk_stat_add_callback(q, q->poll_cb);
return false;
}
static void blk_mq_poll_stats_start(struct request_queue *q)
{
/*
* We don't arm the callback if polling stats are not enabled or the
* callback is already active.
*/
if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
blk_stat_is_active(q->poll_cb))
return;
blk_stat_activate_msecs(q->poll_cb, 100);
}
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
struct request_queue *q = cb->data;
for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
if (cb->stat[bucket].nr_samples)
q->poll_stat[bucket] = cb->stat[bucket];
}
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
struct request *rq)
{
unsigned long ret = 0;
/*
* If stats collection isn't on, don't sleep but turn it on for
* future users
*/
if (!blk_poll_stats_enable(q))
return 0;
/*
* As an optimistic guess, use half of the mean service time
* for this type of request. We can (and should) make this smarter.
* For instance, if the completion latencies are tight, we can
* get closer than just half the mean. This is especially
* important on devices where the completion latencies are longer
* than ~10 usec. We do use the stats for the relevant IO size
* if available which does lead to better estimates.
bucket = blk_mq_poll_stats_bkt(rq);
if (bucket < 0)
return ret;
if (q->poll_stat[bucket].nr_samples)
ret = (q->poll_stat[bucket].mean + 1) / 2;
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
struct request *rq)
{
struct hrtimer_sleeper hs;
enum hrtimer_mode mode;
* If we get here, hybrid polling is enabled. Hence poll_nsec can be:
*
* 0: use half of prev avg
* >0: use this specific value
*/
if (q->poll_nsec > 0)
nsecs = blk_mq_poll_nsecs(q, rq);
/*
* This will be replaced with the stats tracking code, using