Newer
Older
/*
* Block multiqueue core code
*
* Copyright (C) 2013-2014 Jens Axboe
* Copyright (C) 2013-2014 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
Ingo Molnar
committed
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, bytes, bucket;
bytes = blk_rq_bytes(rq);
bucket = ddir + 2*(ilog2(bytes) - 9);
if (bucket < 0)
return -1;
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
return bucket;
}
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
struct mq_inflight {
struct hd_struct *part;
unsigned int *inflight;
};
static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
/*
* index[0] counts the specific partition that was asked for. index[1]
* counts the ones that are active on the whole device, so increment
* that if mi->part is indeed a partition, and not a whole device.
*/
if (rq->part == mi->part)
mi->inflight[0]++;
if (mi->part->partno)
mi->inflight[1]++;
}
void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part, .inflight = inflight, };
inflight[0] = inflight[1] = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
}
static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
if (rq->part == mi->part)
mi->inflight[rq_data_dir(rq)]++;
}
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part, .inflight = inflight, };
inflight[0] = inflight[1] = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
}
void blk_freeze_queue_start(struct request_queue *q)
freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
if (q->mq_ops)
blk_mq_run_hw_queues(q, false);
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout)
{
return wait_event_timeout(q->mq_freeze_wq,
percpu_ref_is_zero(&q->q_usage_counter),
timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_freeze_queue(struct request_queue *q)
/*
* In the !blk_mq case we are only calling this to kill the
* q_usage_counter, otherwise this increases the freeze depth
* and waits for it to return to zero. For this reason there is
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
if (!q->mq_ops)
blk_drain_queue(q);
blk_mq_freeze_queue_wait(q);
}
void blk_mq_freeze_queue(struct request_queue *q)
{
/*
* ...just an alias to keep freeze and unfreeze actions balanced
* in the blk_mq_* namespace
*/
blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void blk_mq_unfreeze_queue(struct request_queue *q)
freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
WARN_ON_ONCE(freeze_depth < 0);
if (!freeze_depth) {
percpu_ref_reinit(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
* callback function is invoked. Once this function is returned, we make
* sure no dispatch can happen until the queue is unquiesced via
* blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
bool rcu = false;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
synchronize_srcu(hctx->srcu);
else
rcu = true;
}
if (rcu)
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/*
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
* @q: request queue.
*
* This function recovers queue into the state before quiescing
* which is done by blk_mq_quiesce_queue.
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
/* dispatch requests which are inserted during quiescing */
blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
unsigned int tag, unsigned int op)
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->tag = -1;
rq->internal_tag = tag;
} else {
if (blk_mq_tag_busy(data->hctx)) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
rq->internal_tag = -1;
data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->rq_flags = rq_flags;
rq->cpu = -1;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
INIT_LIST_HEAD(&rq->queuelist);
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
rq->special = NULL;
/* tag was already set */
rq->extra_len = 0;
rq->__deadline = 0;
INIT_LIST_HEAD(&rq->timeout_list);
rq->timeout = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
rq->next_rq = NULL;
#ifdef CONFIG_BLK_CGROUP
rq->rl = NULL;
set_start_time_ns(rq);
rq->cgroup_io_start_time_ns = 0;
data->ctx->rq_dispatched[op_is_sync(op)]++;
return rq;
static struct request *blk_mq_get_request(struct request_queue *q,
struct bio *bio, unsigned int op,
struct blk_mq_alloc_data *data)
{
struct elevator_queue *e = q->elevator;
struct request *rq;
bool put_ctx_on_error = false;
blk_queue_enter_live(q);
data->q = q;
if (likely(!data->ctx)) {
data->ctx = blk_mq_get_ctx(q);
put_ctx_on_error = true;
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
if (op & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (e) {
data->flags |= BLK_MQ_REQ_INTERNAL;
/*
* Flush requests are special and go directly to the
* dispatch list.
*/
if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
e->type->ops.mq.limit_depth(op, data);
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
if (put_ctx_on_error) {
blk_mq_put_ctx(data->ctx);
data->ctx = NULL;
}
blk_queue_exit(q);
return NULL;
rq = blk_mq_rq_ctx_init(data, tag, op);
if (!op_is_flush(op)) {
rq->elv.icq = NULL;
if (e && e->type->ops.mq.prepare_request) {
if (e->type->icq_cache && rq_ioc(bio))
blk_mq_sched_assign_ioc(rq, bio);
e->type->ops.mq.prepare_request(rq, bio);
rq->rq_flags |= RQF_ELVPRIV;
}
data->hctx->queued++;
return rq;
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
return ERR_PTR(-EWOULDBLOCK);
blk_mq_put_ctx(alloc_data.ctx);
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
struct blk_mq_alloc_data alloc_data = { .flags = flags };
unsigned int cpu;
int ret;
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags);
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
blk_queue_exit(q);
return ERR_PTR(-EXDEV);
cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
if (!rq)
return ERR_PTR(-EWOULDBLOCK);
return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
const int sched_tag = rq->internal_tag;
if (rq->rq_flags & RQF_ELVPRIV) {
if (e && e->type->ops.mq.finish_request)
e->type->ops.mq.finish_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
wbt_done(q->rq_wb, rq);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
wbt_done(rq->q->rq_wb, rq);
} else {
if (unlikely(blk_bidi_rq(rq)))
blk_mq_free_request(rq->next_rq);
blk_mq_free_request(rq);
EXPORT_SYMBOL(__blk_mq_end_request);
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_request(rq, error);
EXPORT_SYMBOL(blk_mq_end_request);
static void __blk_mq_complete_request_remote(void *data)
struct request *rq = data;
static void __blk_mq_complete_request(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq);
}
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
return;
}
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
shared = cpus_share_cache(cpu, ctx->cpu);
if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
Frederic Weisbecker
committed
smp_call_function_single_async(ctx->cpu, &rq->csd);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
{
if (!(hctx->flags & BLK_MQ_F_BLOCKING))
rcu_read_unlock();
else
srcu_read_unlock(hctx->srcu, srcu_idx);
}
static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
__acquires(hctx->srcu)
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
/* shut up gcc false positive */
*srcu_idx = 0;
*srcu_idx = srcu_read_lock(hctx->srcu);
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
{
unsigned long flags;
/*
* blk_mq_rq_aborted_gstate() is used from the completion path and
* can thus be called from irq context. u64_stats_fetch in the
* middle of update on the same CPU leads to lockup. Disable irq
* while updating.
*/
local_irq_save(flags);
u64_stats_update_begin(&rq->aborted_gstate_sync);
rq->aborted_gstate = gstate;
u64_stats_update_end(&rq->aborted_gstate_sync);
local_irq_restore(flags);
}
static u64 blk_mq_rq_aborted_gstate(struct request *rq)
{
unsigned int start;
u64 aborted_gstate;
do {
start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
aborted_gstate = rq->aborted_gstate;
} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
return aborted_gstate;
}
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
void blk_mq_complete_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
int srcu_idx;
if (unlikely(blk_should_fake_timeout(q)))
/*
* If @rq->aborted_gstate equals the current instance, timeout is
* claiming @rq and we lost. This is synchronized through
* hctx_lock(). See blk_mq_timeout_work() for details.
*
* Completion path never blocks and we can directly use RCU here
* instead of hctx_lock() which can be either RCU or SRCU.
* However, that would complicate paths which want to synchronize
* against us. Let stay in sync with the issue path so that
* hctx_lock() covers both issue and completion paths.
*/
if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
int blk_mq_request_started(struct request *rq)
{
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
blk_mq_sched_started_request(rq);
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
rq->throtl_size = blk_rq_sectors(rq);
#endif
rq->rq_flags |= RQF_STATS;
wbt_issue(q->rq_wb, rq);
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
* Mark @rq in-flight which also advances the generation number,
* and register for timeout. Protect with a seqcount to allow the
* timeout path to read both @rq->gstate and @rq->deadline
* coherently.
* This is the only place where a request is marked in-flight. If
* the timeout path reads an in-flight @rq->gstate, the
* @rq->deadline it reads together under @rq->gstate_seq is
* guaranteed to be the matching one.
preempt_disable();
write_seqcount_begin(&rq->gstate_seq);
blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
blk_add_timer(rq);
write_seqcount_end(&rq->gstate_seq);
preempt_enable();
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
* Make sure space for the drain appears. We know we can do
* this because max_hw_segments has been adjusted to be one
* fewer than the device can handle.
*/
rq->nr_phys_segments++;
}
EXPORT_SYMBOL(blk_mq_start_request);
* When we reach here because queue is busy, it's safe to change the state
* to IDLE without checking @rq->aborted_gstate because we should still be
* holding the RCU read lock and thus protected against timeout.
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, rq);
blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
}
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & RQF_SOFTBARRIER))
continue;
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_sched_insert_request(rq, false, false, false);
blk_mq_run_hw_queues(q, false);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list)
{
struct request_queue *q = rq->q;
unsigned long flags;
/*
* We abuse this flag that is otherwise used by the I/O scheduler to
* request head insertion from the workqueue.
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
rq->rq_flags |= RQF_SOFTBARRIER;
list_add(&rq->queuelist, &q->requeue_list);
} else {
list_add_tail(&rq->queuelist, &q->requeue_list);
}
spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
Bart Van Assche
committed
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
unsigned long msecs)
{
Bart Van Assche
committed
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag < tags->nr_tags) {
prefetch(tags->rqs[tag]);
return tags->rqs[tag];
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
struct blk_mq_timeout_data {
unsigned long next;
unsigned int next_set;
unsigned int nr_expired;
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
ret = ops->timeout(req, reserved);
switch (ret) {
case BLK_EH_HANDLED:
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
/*
* As nothing prevents from completion happening while
* ->aborted_gstate is set, this may lead to ignored
* completions and further spurious timeouts.
*/
blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
break;
case BLK_EH_NOT_HANDLED:
break;
default:
printk(KERN_ERR "block: bad eh return: %d\n", ret);
break;
}
static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct blk_mq_timeout_data *data = priv;
unsigned long gstate, deadline;
int start;
might_sleep();
/* read coherent snapshots of @rq->state_gen and @rq->deadline */
while (true) {
start = read_seqcount_begin(&rq->gstate_seq);
gstate = READ_ONCE(rq->gstate);
deadline = blk_rq_deadline(rq);
if (!read_seqcount_retry(&rq->gstate_seq, start))
break;
cond_resched();
}
/* if in-flight && overdue, mark for abortion */
if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
time_after_eq(jiffies, deadline)) {
blk_mq_rq_update_aborted_gstate(rq, gstate);
data->nr_expired++;
hctx->nr_expired++;
} else if (!data->next_set || time_after(data->next, deadline)) {
data->next = deadline;
static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
/*
* We marked @rq->aborted_gstate and waited for RCU. If there were
* completions that we lost to, they would have finished and
* updated @rq->gstate by now; otherwise, the completion path is
* now guaranteed to see @rq->aborted_gstate and yield. If
* @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
*/
if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
READ_ONCE(rq->gstate) == rq->aborted_gstate)
blk_mq_rq_timed_out(rq, reserved);
}
static void blk_mq_timeout_work(struct work_struct *work)
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
.nr_expired = 0,
struct blk_mq_hw_ctx *hctx;
int i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
* completion, since the timeout code would not be able to
* acquire the queue reference here.
*
* That's why we don't use blk_queue_enter here; instead, we use
* percpu_ref_tryget directly, because we need to be able to
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
* blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
/* scan for the expired ones and set their ->aborted_gstate */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.nr_expired) {
bool has_rcu = false;
/*
* Wait till everyone sees ->aborted_gstate. The
* sequential waits for SRCUs aren't ideal. If this ever
* becomes a problem, we can add per-hw_ctx rcu_head and
* wait in parallel.
*/
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->nr_expired)
continue;
if (!(hctx->flags & BLK_MQ_F_BLOCKING))
has_rcu = true;
else
synchronize_srcu(hctx->srcu);
hctx->nr_expired = 0;
}
if (has_rcu)
synchronize_rcu();
/* terminate the ones we won */
blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
}
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
/*
* Request timeouts are handled as a forward rolling timer. If
* we end up here it means that no requests are pending and
* also that no request has been pending for a while. Mark
* each hctx as idle.
*/
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
struct flush_busy_ctx_data {
struct blk_mq_hw_ctx *hctx;
struct list_head *list;
};
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
struct flush_busy_ctx_data *flush_data = data;
struct blk_mq_hw_ctx *hctx = flush_data->hctx;
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];