You need to sign in or sign up before continuing.
Newer
Older
list_cut_before(&rq_list, &list, pos);
trace_block_unplug(head_rq->q, depth, !from_schedule);
blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
} while(!list_empty(&list));
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
unsigned int nr_segs)
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
WARN_ON_ONCE(err);
blk_account_io_start(rq);
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie, bool last)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
blk_qc_t new_cookie;
blk_status_t ret;
new_cookie = request_to_qc_t(hctx, rq);
/*
* For OK queue, we are done. For error, caller may kill it.
* Any other error (busy), just add it to our list as we
* previously would have done.
*/
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
blk_mq_update_dispatch_busy(hctx, false);
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
default:
blk_mq_update_dispatch_busy(hctx, false);
*cookie = BLK_QC_T_NONE;
break;
}
return ret;
}
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
bool bypass_insert, bool last)
{
struct request_queue *q = rq->q;
int budget_token;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
*
* When queue is stopped or quiesced, ignore 'bypass_insert' from
* blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
* and avoid driver to try to dispatch again.
*/
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
bypass_insert = false;
goto insert;
if (q->elevator && !bypass_insert)
goto insert;
budget_token = blk_mq_get_dispatch_budget(q);
if (budget_token < 0)
goto insert;
blk_mq_set_rq_budget_token(rq, budget_token);
blk_mq_put_dispatch_budget(q, budget_token);
goto insert;
return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
blk_mq_sched_insert_request(rq, false, run_queue, false);
return BLK_STS_OK;
}
/**
 * blk_mq_try_issue_directly - Attempt to hand a request straight to the driver.
 * @hctx: Hardware queue the request is associated with.
 * @rq: Request to be issued.
 * @cookie: Request queue cookie, forwarded to the issue helper.
 *
 * Runs the direct-issue helper while holding the hctx lock. When the helper
 * reports a resource shortage (BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE) the
 * request is passed to blk_mq_request_bypass_insert() so it can be tried
 * again later. Any other non-OK status completes the request with that
 * status.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
		struct request *rq, blk_qc_t *cookie)
{
	int srcu_idx;
	blk_status_t status;

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	hctx_lock(hctx, &srcu_idx);

	status = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
	switch (status) {
	case BLK_STS_OK:
		/* Nothing more to do for this request here. */
		break;
	case BLK_STS_RESOURCE:
	case BLK_STS_DEV_RESOURCE:
		/* Out of resources right now: queue it for a later attempt. */
		blk_mq_request_bypass_insert(rq, false, true);
		break;
	default:
		/* Hard error: finish the request with the reported status. */
		blk_mq_end_request(rq, status);
		break;
	}

	hctx_unlock(hctx, srcu_idx);
}
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
blk_status_t ret;
int srcu_idx;
blk_qc_t unused_cookie;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
hctx_lock(hctx, &srcu_idx);
ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
while (!list_empty(list)) {
blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
queuelist);
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq, list_empty(list));
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
blk_mq_request_bypass_insert(rq, false,
break;
}
blk_mq_end_request(rq, ret);
/*
* If we didn't flush the entire list, we could have told
* the driver there was more coming, but that turned out to
* be a lie.
*/
if ((!list_empty(list) || errors) &&
hctx->queue->mq_ops->commit_rqs && queued)
/*
 * Append @rq to the tail of @plug's request list and bump the plug's request
 * count. If the plug is not yet marked as spanning multiple queues and it
 * now holds more than one request, compare the queue of the first request
 * on the list with @rq's queue and set ->multiple_queues when they differ.
 */
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
	struct request *head_rq;

	list_add_tail(&rq->queuelist, &plug->mq_list);
	plug->rq_count++;

	/* Already flagged, or @rq is the only entry: nothing to compare. */
	if (plug->multiple_queues || list_is_singular(&plug->mq_list))
		return;

	head_rq = list_first_entry(&plug->mq_list, struct request, queuelist);
	if (head_rq->q != rq->q)
		plug->multiple_queues = true;
}
/*
 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
 * queues. This is important for md arrays to benefit from merging
 * requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	/* A plug that spans several queues gets twice the base limit. */
	return plug->multiple_queues ? BLK_MAX_REQUEST_COUNT * 2
				     : BLK_MAX_REQUEST_COUNT;
}
/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @q and @bio and sends it to the device.
 * The request may not be queued directly to hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 *
 * Returns: Request queue cookie.
 */
blk_qc_t blk_mq_submit_bio(struct bio *bio)
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
unsigned int nr_segs;
blk_qc_t cookie;
blk_queue_bounce(q, &bio);
__blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
Christoph Hellwig
committed
goto queue_exit;
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
Christoph Hellwig
committed
goto queue_exit;
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
Christoph Hellwig
committed
goto queue_exit;
rq_qos_throttle(q, bio);
plug = blk_mq_plug(q, bio);
if (plug && plug->cached_rq) {
rq = plug->cached_rq;
plug->cached_rq = rq->rq_next;
INIT_LIST_HEAD(&rq->queuelist);
} else {
struct blk_mq_alloc_data data = {
.q = q,
.nr_tags = 1,
.cmd_flags = bio->bi_opf,
};
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
data.cached_rq = &plug->cached_rq;
}
rq = __blk_mq_alloc_requests(&data);
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
goto queue_exit;
}
trace_block_getrq(bio);
cookie = request_to_qc_t(rq->mq_hctx, rq);
blk_mq_bio_to_request(rq, bio, nr_segs);
ret = blk_crypto_init_request(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
blk_mq_free_request(rq);
return BLK_QC_T_NONE;
}
if (unlikely(is_flush_fua)) {
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
/* Bypass scheduler for flush requests */
blk_insert_flush(rq);
blk_mq_run_hw_queue(hctx, true);
} else if (plug && (q->nr_hw_queues == 1 ||
blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
*
* Use normal plugging if this disk is slow HDD, as sequential
* IO may benefit a lot from plug merging.
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
else
last = list_entry_rq(plug->mq_list.prev);
if (request_count >= blk_plug_max_rq_count(plug) || (last &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
blk_add_rq_to_plug(plug, rq);
/* Insert the request at the IO scheduler queue */
blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
* The plug list might get flushed before this. If that happens,
* the plug list is empty, and same_queue_rq is invalid.
if (list_empty(&plug->mq_list))
same_queue_rq = NULL;
if (same_queue_rq) {
list_del_init(&same_queue_rq->queuelist);
plug->rq_count--;
}
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
trace_block_unplug(q, 1, true);
blk_mq_try_issue_directly(same_queue_rq->mq_hctx,
same_queue_rq, &cookie);
} else if ((q->nr_hw_queues > 1 && is_sync) ||
/*
* There is no scheduler and we can try to send directly
* to the hardware.
*/
blk_mq_try_issue_directly(rq->mq_hctx, rq, &cookie);
blk_mq_sched_insert_request(rq, false, true, true);
return cookie;
Christoph Hellwig
committed
queue_exit:
blk_queue_exit(q);
return BLK_QC_T_NONE;
/* Size in bytes of a page allocation of the given order (PAGE_SIZE << order). */
static size_t order_to_size(unsigned int order)
{
	size_t bytes = (size_t)PAGE_SIZE;

	bytes <<= order;
	return bytes;
}
/*
 * Called before freeing the request pool in @tags.
 *
 * Walks every page on @tags->page_list and resets to NULL each
 * @drv_tags->rqs[] slot whose request pointer falls inside that page's
 * address range, so no slot keeps pointing into memory about to be freed.
 */
static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
				    struct blk_mq_tags *tags)
{
	unsigned long flags;
	struct page *pg;

	/* A driver tags structure never needs its own mapping cleared. */
	if (tags == drv_tags)
		return;

	list_for_each_entry(pg, &tags->page_list, lru) {
		unsigned long first = (unsigned long)page_address(pg);
		unsigned long limit = first + order_to_size(pg->private);
		int idx;

		for (idx = 0; idx < drv_tags->nr_tags; idx++) {
			struct request *rq = drv_tags->rqs[idx];
			unsigned long addr = (unsigned long)rq;

			/* Leave alone anything outside [first, limit). */
			if (addr < first || addr >= limit)
				continue;

			WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
			/* Only clear the slot if it still holds this request. */
			cmpxchg(&drv_tags->rqs[idx], rq, NULL);
		}
	}

	/*
	 * Take and drop ->lock to wait for any iteration already in
	 * progress. Once the lock has been released, the cleared request
	 * references are guaranteed to be observed.
	 */
	spin_lock_irqsave(&drv_tags->lock, flags);
	spin_unlock_irqrestore(&drv_tags->lock, flags);
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
struct blk_mq_tags *drv_tags;
struct page *page;
if (blk_mq_is_shared_tags(set->flags))
drv_tags = set->shared_tags;
else
drv_tags = set->tags[hctx_idx];
if (tags->static_rqs && set->ops->exit_request) {
for (i = 0; i < tags->nr_tags; i++) {
struct request *rq = tags->static_rqs[i];
if (!rq)
set->ops->exit_request(set, rq, hctx_idx);
blk_mq_clear_rq_mapping(drv_tags, tags);
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
/*
* Remove kmemleak object previously allocated in
*/
kmemleak_free(page_address(page));
__free_pages(page, page->private);
}
void blk_mq_free_rq_map(struct blk_mq_tags *tags)
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
blk_mq_free_tags(tags);
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags)
struct blk_mq_tags *tags;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
if (!tags->rqs) {
blk_mq_free_tags(tags);
return NULL;
}
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags);
return tags;
}
/*
 * Run the tag set's optional ->init_request() hook on @rq.
 *
 * Returns 0 when no hook is installed or the hook succeeds, otherwise the
 * non-zero value the hook returned.
 */
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
			       unsigned int hctx_idx, int node)
{
	/* Nothing to do for drivers without a per-request init hook. */
	if (!set->ops->init_request)
		return 0;

	return set->ops->init_request(set, rq, hctx_idx, node);
}
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
INIT_LIST_HEAD(&tags->page_list);
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
*/
rq_size = round_up(sizeof(struct request) + set->cmd_size,
left = rq_size * depth;
for (i = 0; i < depth; ) {
int this_order = max_order;
struct page *page;
int to_do;
void *p;
while (this_order && left < order_to_size(this_order - 1))
this_order--;
do {
page = alloc_pages_node(node,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
if (page)
break;
if (!this_order--)
break;
if (order_to_size(this_order) < rq_size)
break;
} while (1);
if (!page)
goto fail;
page->private = this_order;
list_add_tail(&page->lru, &tags->page_list);
p = page_address(page);
/*
* Allow kmemleak to scan these pages as they contain pointers
* to additional allocations like via ops->init_request().
*/
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
entries_per_page = order_to_size(this_order) / rq_size;
to_do = min(entries_per_page, depth - i);
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
struct request *rq = p;
tags->static_rqs[i] = rq;
if (blk_mq_init_request(set, rq, hctx_idx, node)) {
tags->static_rqs[i] = NULL;
goto fail;
p += rq_size;
i++;
}
}
blk_mq_free_rqs(set, tags, hctx_idx);
return -ENOMEM;
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
/*
 * Search state handed to blk_mq_has_request() through its opaque data
 * pointer by blk_mq_hctx_has_requests().
 */
struct rq_iter_data {
/* Hardware queue whose requests are being looked for. */
struct blk_mq_hw_ctx *hctx;
/* Set to true by the callback once a request on @hctx is found. */
bool has_rq;
};
/*
 * Tag-iterator callback: record whether @rq belongs to the hardware queue
 * named in the struct rq_iter_data passed via @data.
 *
 * Returns true when @rq is on a different hctx, and false after a match
 * has been recorded (presumably telling the iterator to stop - confirm
 * against blk_mq_all_tag_iter()). @reserved is unused.
 */
static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
{
	struct rq_iter_data *iter = data;

	if (rq->mq_hctx == iter->hctx) {
		iter->has_rq = true;
		return false;
	}

	return true;
}
/*
 * Report whether any request visited by blk_mq_all_tag_iter() belongs to
 * @hctx. The scheduler tags are iterated when present, otherwise the
 * hctx's own tags.
 */
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
	struct rq_iter_data data = {
		.hctx = hctx,
	};
	struct blk_mq_tags *tags = hctx->sched_tags;

	if (!tags)
		tags = hctx->tags;

	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);

	return data.has_rq;
}
/*
 * True only when @cpu is both the first CPU found in the intersection of
 * @hctx->cpumask and cpu_online_mask and there is no further CPU after it
 * in that intersection, i.e. @cpu is the sole online CPU of this hctx.
 */
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
		struct blk_mq_hw_ctx *hctx)
{
	return cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu &&
	       cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) >=
			nr_cpu_ids;
}
/*
 * CPU hotplug callback run when @cpu goes offline.
 *
 * Does nothing unless @cpu is in the hctx's cpumask and is its last online
 * CPU. Otherwise marks the hctx inactive and, if a queue reference can be
 * taken, polls until the hctx has no requests left. Always returns 0.
 */
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);

	if (!cpumask_test_cpu(cpu, hctx->cpumask))
		return 0;
	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
		return 0;

	/*
	 * Stop new requests from being allocated on this hctx.
	 *
	 * smp_mb__after_atomic() pairs with the implied barrier in
	 * test_and_set_bit_lock in sbitmap_get(), so the inactive flag is
	 * visible once we return from the tag allocator.
	 */
	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
	smp_mb__after_atomic();

	/*
	 * Failing to get a queue reference means the queue has been frozen
	 * and there are no requests, so there is nothing to wait for.
	 */
	if (!percpu_ref_tryget(&hctx->queue->q_usage_counter))
		return 0;

	/* Wait for outstanding requests on this hctx to drain. */
	while (blk_mq_hctx_has_requests(hctx))
		msleep(5);
	percpu_ref_put(&hctx->queue->q_usage_counter);

	return 0;
}
/*
 * CPU hotplug callback run when @cpu comes online: if @cpu is in the hctx's
 * cpumask, clear the BLK_MQ_S_INACTIVE state bit. Always returns 0.
 */
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);
	if (!cpumask_test_cpu(cpu, hctx->cpumask))
		return 0;

	clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
	return 0;
}
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
* gets run.
*/
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
if (!cpumask_test_cpu(cpu, hctx->cpumask))
return 0;
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
blk_mq_hctx_clear_pending(hctx, ctx);
}
spin_unlock(&ctx->lock);
if (list_empty(&tmp))
spin_lock(&hctx->lock);
list_splice_tail_init(&tmp, &hctx->dispatch);
spin_unlock(&hctx->lock);
blk_mq_run_hw_queue(hctx, true);
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
if (!(hctx->flags & BLK_MQ_F_STACKING))
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
/*
 * Before freeing hw queue, clear the flush request reference in
 * tags->rqs[] to avoid a potential use-after-free.
 *
 * @tags:        tag structure whose rqs[] table is scanned; may be NULL,
 *               in which case nothing is done.
 * @queue_depth: number of rqs[] slots to examine.
 * @flush_rq:    flush request whose references are removed; its refcount
 *               is expected to be zero (a one-time warning fires if not).
 */
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
		unsigned int queue_depth, struct request *flush_rq)
{
	/* Same type as @queue_depth, so the bound check is not mixed-sign. */
	unsigned int i;
	unsigned long flags;

	/* The hw queue may not be mapped yet */
	if (!tags)
		return;

	WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);

	/* Reset only the slots that still point at the flush request. */
	for (i = 0; i < queue_depth; i++)
		cmpxchg(&tags->rqs[i], flush_rq, NULL);

	/*
	 * Wait until all pending iteration is done.
	 *
	 * Request reference is cleared and it is guaranteed to be observed
	 * after the ->lock is released.
	 */
	spin_lock_irqsave(&tags->lock, flags);
	spin_unlock_irqrestore(&tags->lock, flags);
}
/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct request *flush_rq = hctx->fq->flush_rq;
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
set->queue_depth, flush_rq);
if (set->ops->exit_request)
set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
blk_mq_remove_cpuhp(hctx);
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
/*
 * Number of bytes to allocate for a hardware context of @tag_set:
 * sizeof(struct blk_mq_hw_ctx), plus room for a struct srcu_struct when the
 * tag set has BLK_MQ_F_BLOCKING set.
 */
static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
{
	/*
	 * Compile-time check that the (aligned) offset of ->srcu equals the
	 * size of the structure, i.e. the extra space lands right at ->srcu.
	 */
	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
			   __alignof__(struct blk_mq_hw_ctx)) !=
		     sizeof(struct blk_mq_hw_ctx));

	if (!(tag_set->flags & BLK_MQ_F_BLOCKING))
		return sizeof(struct blk_mq_hw_ctx);

	return sizeof(struct blk_mq_hw_ctx) + sizeof(struct srcu_struct);
}
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
hctx->queue_num = hctx_idx;
if (!(hctx->flags & BLK_MQ_F_STACKING))
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto unregister_cpu_notifier;
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
return 0;
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
unregister_cpu_notifier:
blk_mq_remove_cpuhp(hctx);
return -1;
}
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
int node)
{
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
goto free_hctx;
atomic_set(&hctx->nr_active, 0);
node = set->numa_node;
hctx->numa_node = node;
INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
INIT_LIST_HEAD(&hctx->hctx_list);
* Allocate space for all possible cpus to avoid allocation at
* runtime
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
goto free_ctxs;
hctx->nr_ctx = 0;
spin_lock_init(&hctx->dispatch_wait_lock);
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
if (hctx->flags & BLK_MQ_F_BLOCKING)
init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
free_cpumask:
free_cpumask_var(hctx->cpumask);
free_hctx:
kfree(hctx);
fail_alloc_hctx:
return NULL;
static void blk_mq_init_cpu_queues(struct request_queue *q,
unsigned int nr_hw_queues)
{
struct blk_mq_tag_set *set = q->tag_set;
unsigned int i, j;
for_each_possible_cpu(i) {
struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
struct blk_mq_hw_ctx *hctx;
__ctx->cpu = i;
spin_lock_init(&__ctx->lock);
for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
INIT_LIST_HEAD(&__ctx->rq_lists[k]);
__ctx->queue = q;
/*
* Set local node, IFF we have more than one hw queue. If
* not, we remain on the home node of the device
*/
for (j = 0; j < set->nr_maps; j++) {
hctx = blk_mq_map_queue_type(q, j, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
hctx->numa_node = cpu_to_node(i);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int depth)
struct blk_mq_tags *tags;
int ret;
tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
if (ret) {
blk_mq_free_rq_map(tags);
return tags;
}
static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
int hctx_idx)
{
if (blk_mq_is_shared_tags(set->flags)) {
set->tags[hctx_idx] = set->shared_tags;
return true;
}
set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
set->queue_depth);
return set->tags[hctx_idx];
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx)
if (tags) {
blk_mq_free_rqs(set, tags, hctx_idx);
blk_mq_free_rq_map(tags);
/*
 * Release the tags stored at @set->tags[@hctx_idx] and reset the slot to
 * NULL. With shared tags the slot is only reset; the tags themselves are
 * not freed here.
 */
static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
				      unsigned int hctx_idx)
{
	struct blk_mq_tags **slot = &set->tags[hctx_idx];

	if (!blk_mq_is_shared_tags(set->flags))
		blk_mq_free_map_and_rqs(set, *slot, hctx_idx);

	*slot = NULL;
}
static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
queue_for_each_hw_ctx(q, hctx, i) {