/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
/*
 * Report whether any software queue mapped to @hctx is flagged as pending,
 * i.e. whether any word of hctx->ctx_map is non-zero.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int idx = 0;

	while (idx < hctx->ctx_map.map_size) {
		if (hctx->ctx_map.map[idx].word != 0)
			return true;
		idx++;
	}

	return false;
}
/*
 * Return the ctx_map word of @hctx that holds the pending bit for @ctx,
 * selected by ctx->index_hw / bits_per_word.
 */
static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
					      struct blk_mq_ctx *ctx)
{
	return hctx->ctx_map.map +
		(ctx->index_hw / hctx->ctx_map.bits_per_word);
}
/*
 * Bit position of @ctx inside its ctx_map word: index_hw masked with
 * (bits_per_word - 1).
 * NOTE(review): the mask only equals index_hw % bits_per_word when
 * bits_per_word is a power of two -- confirm where ctx_map is set up.
 */
#define CTX_TO_BIT(hctx, ctx) \
((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
/*
 * Flag @ctx as having requests queued for @hctx. The bit is tested first
 * and only written when it is currently clear.
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	if (test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
		return;

	set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
struct blk_align_bitmap *bm = get_bm(hctx, ctx);
clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}
static int blk_mq_queue_enter(struct request_queue *q)
{
if (percpu_ref_tryget_live(&q->mq_usage_counter))
return 0;
ret = wait_event_interruptible(q->mq_freeze_wq,
!q->mq_freeze_depth || blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
if (ret)
return ret;
}
}
/* Drop one reference on the usage counter of @q. */
static void blk_mq_queue_exit(struct request_queue *q)
{
	struct percpu_ref *usage = &q->mq_usage_counter;

	percpu_ref_put(usage);
}
/*
 * percpu_ref callback for q->mq_usage_counter: wake every sleeper on
 * q->mq_freeze_wq.
 */
static void blk_mq_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, mq_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_mq_freeze_queue(struct request_queue *q)
spin_lock_irq(q->queue_lock);
q->mq_freeze_depth++;
spin_unlock_irq(q->queue_lock);
percpu_ref_kill(&q->mq_usage_counter);
blk_mq_run_queues(q, false);
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
/*
 * Undo one blk_mq_freeze_queue(). When the freeze depth drops to zero the
 * usage counter is re-initialised and waiters on mq_freeze_wq are woken.
 */
static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake;

	spin_lock_irq(q->queue_lock);
	wake = !--q->mq_freeze_depth;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	spin_unlock_irq(q->queue_lock);

	if (wake) {
		percpu_ref_reinit(&q->mq_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
}
/* Return the result of blk_mq_has_free_tags() for the tag set of @hctx. */
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	return blk_mq_has_free_tags(tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);
static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
struct request *rq, unsigned int rw_flags)
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = q;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_CGROUP
rq->rl = NULL;
rq->io_start_time_ns = 0;
#endif
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
rq->special = NULL;
/* tag was already set */
rq->errors = 0;
rq->extra_len = 0;
rq->sense_len = 0;
rq->resid_len = 0;
rq->sense = NULL;
INIT_LIST_HEAD(&rq->timeout_list);
rq->timeout = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
rq->next_rq = NULL;
ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}
static struct request *
__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
{
struct request *rq;
unsigned int tag;
if (tag != BLK_MQ_TAG_FAIL) {
rq->cmd_flags = 0;
rq->cmd_flags = REQ_MQ_INFLIGHT;
}
rq->tag = tag;
blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
return rq;
}
return NULL;
}
/*
 * Allocate a request on @q.
 *
 * A first attempt is made with __GFP_WAIT masked out of @gfp. If that fails
 * and the caller passed __GFP_WAIT, the hardware queue is run and a second
 * attempt is made with the caller's full @gfp.
 *
 * Returns the request, or NULL if the queue could not be entered or no
 * request could be allocated.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
		bool reserved)
{
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	struct blk_mq_alloc_data alloc_data;

	if (blk_mq_queue_enter(q))
		return NULL;

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
			reserved, ctx, hctx);

	rq = __blk_mq_alloc_request(&alloc_data, rw);
	if (!rq && (gfp & __GFP_WAIT)) {
		__blk_mq_run_hw_queue(hctx);
		blk_mq_put_ctx(ctx);

		ctx = blk_mq_get_ctx(q);
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
				hctx);
		rq = __blk_mq_alloc_request(&alloc_data, rw);
		/* Release whichever ctx alloc_data holds after this attempt. */
		ctx = alloc_data.ctx;
	}
	blk_mq_put_ctx(ctx);

	/*
	 * The reference taken by blk_mq_queue_enter() is normally dropped in
	 * __blk_mq_free_request(). With no request to free, drop it here or
	 * a later blk_mq_freeze_queue() would wait forever.
	 */
	if (!rq)
		blk_mq_queue_exit(q);

	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
/*
 * Release @rq: undo the nr_active accounting for in-flight requests, clear
 * the STARTED bit, return the tag and drop the queue usage reference.
 * The tag and queue are read up front, before the tag is put back.
 */
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	struct request_queue *queue = rq->q;
	const int freed_tag = rq->tag;

	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);

	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	blk_mq_put_tag(hctx, freed_tag, &ctx->last_tag);
	blk_mq_queue_exit(queue);
}
/*
 * Account @rq as completed on its software queue, then free it on the
 * hardware queue that the software queue's CPU maps to.
 */
void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	ctx->rq_completed[rq_is_sync(rq)]++;
	__blk_mq_free_request(q->mq_ops->map_queue(q, ctx->cpu), ctx, rq);
}
/*
* Clone all relevant state from a request that has been put on hold in
* the flush state machine into the preallocated flush request that hangs
* off the request queue.
*
* For a driver the flush request should be invisible, that's why we are
* impersonating the original request here.
*/
void blk_mq_clone_flush_request(struct request *flush_rq,
struct request *orig_rq)
{
struct blk_mq_hw_ctx *hctx =
orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
flush_rq->mq_ctx = orig_rq->mq_ctx;
flush_rq->tag = orig_rq->tag;
memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
hctx->cmd_size);
}
inline void __blk_mq_end_io(struct request *rq, int error)
} else {
if (unlikely(blk_bidi_rq(rq)))
blk_mq_free_request(rq->next_rq);
blk_mq_free_request(rq);
EXPORT_SYMBOL(__blk_mq_end_io);
void blk_mq_end_io(struct request *rq, int error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_io(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_io);
static void __blk_mq_complete_request_remote(void *data)
struct request *rq = data;
static void blk_mq_ipi_complete_request(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
return;
}
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
shared = cpus_share_cache(cpu, ctx->cpu);
if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
Frederic Weisbecker
committed
smp_call_function_single_async(ctx->cpu, &rq->csd);
/*
 * Complete @rq: through the softirq_done_fn path when the queue has one,
 * otherwise by ending the I/O directly with rq->errors.
 */
void __blk_mq_complete_request(struct request *rq)
{
	if (rq->q->softirq_done_fn)
		blk_mq_ipi_complete_request(rq);
	else
		blk_mq_end_io(rq, rq->errors);
}
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through an IPI handler.
**/
void blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	/* Fault injection: drop the completion so the request times out. */
	if (unlikely(blk_should_fake_timeout(q)))
		return;

	/* Only complete if nobody else has marked the request complete. */
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
/*
 * Prepare @rq for being handed to the driver.
 *
 * @last: true when this is the final request of the current dispatch
 *        batch; the request is then flagged REQ_END.
 *
 * NOTE(review): the resid_len and blk_add_timer() lines were recovered
 * from a damaged listing; confirm against the tree.
 */
static void blk_mq_start_request(struct request *rq, bool last)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	rq->resid_len = blk_rq_bytes(rq);
	if (unlikely(blk_bidi_rq(rq)))
		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);

	blk_add_timer(rq);

	/*
	 * Mark us as started and clear complete. Complete might have been
	 * set if requeue raced with timeout, which then marked it as
	 * complete. So be sure to clear complete again when we start
	 * the request, otherwise we'll ignore the completion event.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}

	/*
	 * Flag the last request in the series so that drivers know when IO
	 * should be kicked off, if they don't do it on a per-request basis.
	 *
	 * Note: the flag isn't the only condition on which drivers should
	 * kick off IO. If the drive is busy, the last request might not
	 * have the bit set.
	 */
	if (last)
		rq->cmd_flags |= REQ_END;
}
/*
 * Undo the visible effects of blk_mq_start_request() on @rq: clear the
 * STARTED bit and REQ_END, and give back the extra drain segment.
 */
static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	rq->cmd_flags &= ~REQ_END;

	/* Mirrors the nr_phys_segments++ done when the request was started. */
	if (q->dma_drain_size && blk_rq_bytes(rq))
		rq->nr_phys_segments--;
}
void blk_mq_requeue_request(struct request *rq)
{
__blk_mq_requeue_request(rq);
blk_clear_rq_complete(rq);
BUG_ON(blk_queued_rq(rq));
blk_mq_add_to_requeue_list(rq, true);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, requeue_work);
LIST_HEAD(rq_list);
struct request *rq, *next;
unsigned long flags;
spin_lock_irqsave(&q->requeue_lock, flags);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irqrestore(&q->requeue_lock, flags);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->cmd_flags & REQ_SOFTBARRIER))
continue;
rq->cmd_flags &= ~REQ_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, false, false, false);
}
blk_mq_run_queues(q, false);
}
/*
 * Queue @rq on q->requeue_list under q->requeue_lock.
 *
 * @at_head: add to the front of the list and tag the request with
 *           REQ_SOFTBARRIER, otherwise append to the tail.
 */
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
{
	struct request_queue *q = rq->q;
	unsigned long irq_flags;

	/*
	 * REQ_SOFTBARRIER, otherwise used by the I/O scheduler, is borrowed
	 * here to ask the requeue work for head insertion, so it must not
	 * already be set on entry.
	 */
	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, irq_flags);
	if (!at_head) {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	} else {
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, irq_flags);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
/* Schedule q->requeue_work on kblockd. */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
	struct work_struct *work = &q->requeue_work;

	kblockd_schedule_work(work);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
static inline bool is_flush_request(struct request *rq, unsigned int tag)
return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
rq->q->flush_rq->tag == tag);
}
/*
 * Map @tag to its request in @tags. If that request is in a flush sequence
 * and the queue's flush request holds the same tag, the flush request is
 * returned instead.
 */
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	struct request *rq = tags->rqs[tag];

	return is_flush_request(rq, tag) ? rq->q->flush_rq : rq;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
/*
 * Argument bundle handed to blk_mq_timeout_check() through
 * blk_mq_tag_busy_iter().
 */
struct blk_mq_timeout_data {
/* hardware queue whose tags are being scanned */
struct blk_mq_hw_ctx *hctx;
/* passed through to blk_rq_check_expired() as its 'next' argument */
unsigned long *next;
/* passed through to blk_rq_check_expired() as its 'next_set' argument */
unsigned int *next_set;
};
/*
 * Tag iterator callback: check every in-use tag of the hardware queue for
 * an expired request.
 *
 * @__data:    struct blk_mq_timeout_data set up by the caller
 * @free_tags: bitmap in which a zero bit denotes a tag that is in use
 */
static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/*
	 * It may not be in flight yet (this is where
	 * the REQ_ATOM_STARTED flag comes in). The requests are
	 * statically allocated, so we know it's always safe to access the
	 * memory associated with a bit offset into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
		/* No further busy tag: stop before indexing past ->rqs[]. */
		if (tag >= hctx->tags->nr_tags)
			break;

		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
		if (rq->q != hctx->queue)
			continue;
		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}
/*
 * Have the tagging code walk the busy tags of @hctx and run
 * blk_mq_timeout_check() on them, forwarding @next and @next_set.
 */
static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data;

	data.hctx = hctx;
	data.next = next;
	data.next_set = next_set;

	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}
/*
 * Decide what to do with a request whose timer fired.
 *
 * Returns BLK_EH_NOT_HANDLED if the request is no longer started,
 * BLK_EH_RESET_TIMER if the driver has no timeout handler, otherwise
 * whatever the driver's ->timeout() returns.
 */
static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
{
	struct request_queue *q = rq->q;

	/*
	 * Complete is known to be set here. The timeout code checks STARTED
	 * first and then assumes the request is active; if it raced with a
	 * completion, both flags end up cleared. Re-check STARTED and ignore
	 * the timeout for a request that is not active any more.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		return BLK_EH_NOT_HANDLED;

	return q->mq_ops->timeout ? q->mq_ops->timeout(rq)
				  : BLK_EH_RESET_TIMER;
}
/*
 * Queue timeout timer callback; @data is the request_queue pointer.
 *
 * Every hardware queue that has software queues and tags is checked for
 * expired requests. If the check set a next expiry, the timer is re-armed
 * for it; otherwise blk_mq_tag_idle() is called on every hardware queue.
 */
static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	/* unsigned to match blk_mq_hw_ctx_check_timeout()'s parameter type */
	unsigned int next_set = 0;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!hctx->nr_ctx || !hctx->tags)
			continue;

		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
	}

	if (next_set) {
		next = blk_rq_timeout(round_jiffies_up(next));
		mod_timer(&q->timeout, next);
	} else {
		queue_for_each_hw_ctx(q, hctx, i)
			blk_mq_tag_idle(hctx);
	}
}
/*
 * Walk the software queue backwards looking for a request @bio can be
 * merged into. At most 8 entries are examined, to bound the time spent
 * hunting for merges. The walk ends at the first request for which
 * blk_try_merge() reports a back or front merge, whether or not the merge
 * then succeeds.
 *
 * Returns true (and bumps ctx->rq_merged) when the bio was merged.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int budget = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		bool merged;
		int el_ret;

		if (!budget--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE)
			merged = bio_attempt_back_merge(q, rq, bio);
		else if (el_ret == ELEVATOR_FRONT_MERGE)
			merged = bio_attempt_front_merge(q, rq, bio);
		else
			continue;

		if (merged) {
			ctx->rq_merged++;
			return true;
		}
		break;
	}

	return false;
}
/*
 * Process software queues that have been marked busy, splicing their
 * requests onto the tail of the caller's for-dispatch @list.
 */
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct blk_mq_ctx *ctx;
int i;
for (i = 0; i < hctx->ctx_map.map_size; i++) {
struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
unsigned int off, bit;
/* Nothing pending in this word of the map. */
if (!bm->word)
continue;
bit = 0;
/* Index into hctx->ctxs[] of the first ctx covered by this word. */
off = i * hctx->ctx_map.bits_per_word;
do {
bit = find_next_bit(&bm->word, bm->depth, bit);
if (bit >= bm->depth)
break;
ctx = hctx->ctxs[bit + off];
/* The pending bit is cleared before the ctx lock is taken. */
clear_bit(bit, &bm->word);
spin_lock(&ctx->lock);
list_splice_tail_init(&ctx->rq_list, list);
spin_unlock(&ctx->lock);
bit++;
} while (1);
}
}
/*
* Run this hardware queue, pulling any software queues mapped to it in.
* Note that this function currently has various problems around ordering
* of IO. In particular, we'd like FIFO behaviour on handling existing
* items on the hctx->dispatch list. Ignore that for now.
*/
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct request *rq;
LIST_HEAD(rq_list);
WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;
hctx->run++;
/*
* Touch any software queue that has pending entries.
*/
flush_busy_ctxs(hctx, &rq_list);
/*
* If we have previous entries on our dispatch list, grab them
* and stuff them at the front for more fair dispatch.
*/
if (!list_empty_careful(&hctx->dispatch)) {
spin_lock(&hctx->lock);
if (!list_empty(&hctx->dispatch))
list_splice_init(&hctx->dispatch, &rq_list);
spin_unlock(&hctx->lock);
}
/*
* Now process all the entries, sending them to the driver.
*/
while (!list_empty(&rq_list)) {
int ret;
rq = list_first_entry(&rq_list, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_start_request(rq, list_empty(&rq_list));
ret = q->mq_ops->queue_rq(hctx, rq);
switch (ret) {
case BLK_MQ_RQ_QUEUE_OK:
queued++;
continue;
case BLK_MQ_RQ_QUEUE_BUSY:
list_add(&rq->queuelist, &rq_list);
break;
default:
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
Christoph Hellwig
committed
rq->errors = -EIO;
blk_mq_end_io(rq, rq->errors);
break;
}
if (ret == BLK_MQ_RQ_QUEUE_BUSY)
break;
}
if (!queued)
hctx->dispatched[0]++;
else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
hctx->dispatched[ilog2(queued) + 1]++;
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
if (!list_empty(&rq_list)) {
spin_lock(&hctx->lock);
list_splice(&rq_list, &hctx->dispatch);
spin_unlock(&hctx->lock);
}
}
/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
* BLK_MQ_CPU_WORK_BATCH queued items.
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	const int chosen = hctx->next_cpu;
	int upcoming;

	/* Stay on the same CPU until the current batch is used up. */
	hctx->next_cpu_batch--;
	if (hctx->next_cpu_batch > 0)
		return chosen;

	/* Advance to the next CPU in the mask, wrapping to the first. */
	upcoming = cpumask_next(hctx->next_cpu, hctx->cpumask);
	if (upcoming >= nr_cpu_ids)
		upcoming = cpumask_first(hctx->cpumask);

	hctx->next_cpu = upcoming;
	hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;

	return chosen;
}
/*
 * Run @hctx unless it is stopped.
 *
 * @async: when false and the current CPU is in hctx->cpumask, the queue is
 *         run directly; otherwise the run is deferred to hctx->run_work
 *         (on a CPU picked by blk_mq_hctx_next_cpu() when the queue has
 *         more than one hardware queue).
 *
 * Uses smp_processor_id(); the callers in this file disable preemption
 * around the call.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
		__blk_mq_run_hw_queue(hctx);
	else if (hctx->queue->nr_hw_queues == 1)
		kblockd_schedule_delayed_work(&hctx->run_work, 0);
	else {
		unsigned int cpu;

		cpu = blk_mq_hctx_next_cpu(hctx);
		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
	}
}
/*
 * Run every hardware queue of @q that has work and is not stopped.
 *
 * @async: forwarded to blk_mq_run_hw_queue().
 */
void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/* Skip queues with nothing to do and queues that are stopped. */
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		preempt_disable();
		blk_mq_run_hw_queue(hctx, async);
		preempt_enable();
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);
/*
 * Stop @hctx: cancel its pending run and delay work items, then set
 * BLK_MQ_S_STOPPED in its state.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);
	cancel_delayed_work(&hctx->delay_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
preempt_disable();
blk_mq_run_hw_queue(hctx, false);
preempt_enable();
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
/*
 * Restart only those hardware queues of @q that are currently stopped.
 *
 * @async: forwarded to blk_mq_run_hw_queue().
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int idx;

	queue_for_each_hw_ctx(q, hctx, idx) {
		if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
			clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

			preempt_disable();
			blk_mq_run_hw_queue(hctx, async);
			preempt_enable();
		}
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
/* Work handler for hctx->run_work: run the owning hardware queue. */
static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx =
		container_of(work, struct blk_mq_hw_ctx, run_work.work);

	__blk_mq_run_hw_queue(hctx);
}
/*
 * Work handler for hctx->delay_work: if the hardware queue was stopped,
 * clear the stopped bit and run it; otherwise do nothing.
 */
static void blk_mq_delay_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx =
		container_of(work, struct blk_mq_hw_ctx, delay_work.work);

	if (!test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
		return;

	__blk_mq_run_hw_queue(hctx);
}
/*
 * Schedule hctx->delay_work to run after @msecs milliseconds. With more
 * than one hardware queue the work is placed on a CPU picked by
 * blk_mq_hctx_next_cpu().
 */
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	unsigned long delay = msecs_to_jiffies(msecs);
	unsigned int cpu;

	if (hctx->queue->nr_hw_queues == 1) {
		kblockd_schedule_delayed_work(&hctx->delay_work, delay);
		return;
	}

	cpu = blk_mq_hctx_next_cpu(hctx);
	kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, delay);
}
EXPORT_SYMBOL(blk_mq_delay_queue);
/*
 * Link @rq into the request list of its software queue (head or tail as
 * selected by @at_head) and mark that software queue pending on @hctx.
 * The callers in this file hold ctx->lock around the call.
 */
static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq, bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	if (!at_head)
		list_add_tail(&rq->queuelist, &ctx->rq_list);
	else
		list_add(&rq->queuelist, &ctx->rq_list);

	blk_mq_hctx_mark_pending(hctx, ctx);
}
/*
 * Insert @rq into its software queue, or hand it to the flush machinery.
 *
 * @at_head:   insert at the head of the software queue instead of the tail
 * @run_queue: run the hardware queue after inserting
 * @async:     forwarded to blk_mq_run_hw_queue() when @run_queue is set
 */
void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
		bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);
	/* The request's CPU went offline: adopt the current CPU's ctx. */
	if (!cpu_online(ctx->cpu))
		rq->mq_ctx = ctx = current_ctx;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/* FLUSH/FUA requests not yet in a flush sequence start one. */
	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
	    !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
		blk_insert_flush(rq);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);

	blk_mq_put_ctx(current_ctx);
}
static void blk_mq_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list,
int depth,
bool from_schedule)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *current_ctx;
trace_block_unplug(q, depth, !from_schedule);
current_ctx = blk_mq_get_ctx(q);
if (!cpu_online(ctx->cpu))
ctx = current_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
spin_lock(&ctx->lock);
while (!list_empty(list)) {
struct request *rq;
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
rq->mq_ctx = ctx;
__blk_mq_insert_request(hctx, rq, false);
}
spin_unlock(&ctx->lock);