Newer
Older
// SPDX-License-Identifier: GPL-2.0
/*
* Block multiqueue core code
*
* Copyright (C) 2013-2014 Jens Axboe
* Copyright (C) 2013-2014 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
Ingo Molnar
committed
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-pm.h"
#include "blk-mq-sched.h"
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, sectors, bucket;
sectors = blk_rq_stats_sectors(rq);
bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
return bucket;
}
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
if (!sbitmap_test_bit(&hctx->ctx_map, bit))
sbitmap_set_bit(&hctx->ctx_map, bit);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
sbitmap_clear_bit(&hctx->ctx_map, bit);
struct mq_inflight {
struct hd_struct *part;
unsigned int inflight[2];
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
if (rq->part == mi->part)
mi->inflight[rq_data_dir(rq)]++;
return true;
unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
return mi.inflight[0] + mi.inflight[1];
}
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
inflight[0] = mi.inflight[0];
inflight[1] = mi.inflight[1];
void blk_freeze_queue_start(struct request_queue *q)
mutex_lock(&q->mq_freeze_lock);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
blk_mq_run_hw_queues(q, false);
} else {
mutex_unlock(&q->mq_freeze_lock);
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout)
{
return wait_event_timeout(q->mq_freeze_wq,
percpu_ref_is_zero(&q->q_usage_counter),
timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_freeze_queue(struct request_queue *q)
/*
* In the !blk_mq case we are only calling this to kill the
* q_usage_counter, otherwise this increases the freeze depth
* and waits for it to return to zero. For this reason there is
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_mq_freeze_queue_wait(q);
}
void blk_mq_freeze_queue(struct request_queue *q)
{
/*
* ...just an alias to keep freeze and unfreeze actions balanced
* in the blk_mq_* namespace
*/
blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void blk_mq_unfreeze_queue(struct request_queue *q)
mutex_lock(&q->mq_freeze_lock);
q->mq_freeze_depth--;
WARN_ON_ONCE(q->mq_freeze_depth < 0);
if (!q->mq_freeze_depth) {
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
mutex_unlock(&q->mq_freeze_lock);
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
* callback function is invoked. Once this function is returned, we make
* sure no dispatch can happen until the queue is unquiesced via
* blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
bool rcu = false;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
synchronize_srcu(hctx->srcu);
else
rcu = true;
}
if (rcu)
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/*
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
* @q: request queue.
*
* This function recovers queue into the state before quiescing
* which is done by blk_mq_quiesce_queue.
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
/* dispatch requests which are inserted during quiescing */
blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
* Only need start/end time stamping if we have iostat or
* blk stats enabled, or using an IO scheduler.
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
unsigned int tag, u64 alloc_time_ns)
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->internal_tag = tag;
} else {
if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->rq_flags = rq_flags;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
INIT_LIST_HEAD(&rq->queuelist);
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
blk_crypto_rq_set_defaults(rq);
/* tag was already set */
WRITE_ONCE(rq->deadline, 0);
rq->timeout = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
if (!op_is_flush(data->cmd_flags)) {
struct elevator_queue *e = data->q->elevator;
rq->elv.icq = NULL;
if (e && e->type->ops.prepare_request) {
if (e->type->icq_cache)
blk_mq_sched_assign_ioc(rq);
e->type->ops.prepare_request(rq);
rq->rq_flags |= RQF_ELVPRIV;
}
}
data->hctx->queued++;
static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
struct request_queue *q = data->q;
struct elevator_queue *e = q->elevator;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (e) {
data->flags |= BLK_MQ_REQ_INTERNAL;
/*
* Flush requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
* limiting, as it isn't useful.
if (!op_is_flush(data->cmd_flags) &&
e->type->ops.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.limit_depth(data->cmd_flags, data);
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!(data->flags & BLK_MQ_REQ_INTERNAL))
blk_mq_tag_busy(data->hctx);
/*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
* should have migrated us to an online CPU by now.
*/
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
/*
* Give up the CPU and sleep for a random short time to ensure
* that thread using a realtime scheduling class are migrated
* off the the CPU, and thus off the hctx that is going away.
*/
msleep(3);
goto retry;
}
return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
};
struct request *rq;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = __blk_mq_alloc_request(&data);
goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
};
u64 alloc_time_ns = 0;
unsigned int cpu;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags);
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
data.flags |= BLK_MQ_REQ_INTERNAL;
else
blk_mq_tag_busy(data.hctx);
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->rq_flags & RQF_ELVPRIV) {
if (e && e->type->ops.finish_request)
e->type->ops.finish_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
u64 now = 0;
if (blk_mq_need_time_stamp(rq))
now = ktime_get_ns();
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq, now);
if (rq->internal_tag != BLK_MQ_NO_TAG)
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
rq_qos_done(rq->q, rq);
blk_mq_free_request(rq);
EXPORT_SYMBOL(__blk_mq_end_request);
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_request(rq, error);
EXPORT_SYMBOL(blk_mq_end_request);
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
/*
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = this_cpu_ptr(&blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, ipi_list);
list_del_init(&rq->ipi_list);
rq->q->mq_ops->complete(rq);
}
}
#ifdef CONFIG_SMP
static void trigger_softirq(void *data)
{
struct request *rq = data;
struct list_head *list;
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
}
/*
* Setup and invoke a run of 'trigger_softirq' on the given cpu.
*/
static int raise_blk_irq(int cpu, struct request *rq)
{
if (cpu_online(cpu)) {
call_single_data_t *data = &rq->csd;
data->func = trigger_softirq;
data->info = rq;
data->flags = 0;
smp_call_function_single_async(cpu, data);
return 0;
}
return 1;
}
#else /* CONFIG_SMP */
static int raise_blk_irq(int cpu, struct request *rq)
{
return 1;
}
#endif
static int blk_softirq_cpu_dead(unsigned int cpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
this_cpu_ptr(&blk_cpu_done));
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
return 0;
}
static void __blk_complete_request(struct request *req)
{
struct request_queue *q = req->q;
int cpu, ccpu = req->mq_ctx->cpu;
unsigned long flags;
bool shared = false;
BUG_ON(!q->mq_ops->complete);
local_irq_save(flags);
cpu = smp_processor_id();
/*
* Select completion CPU
*/
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
shared = cpus_share_cache(cpu, ccpu);
} else
ccpu = cpu;
/*
* If current CPU and requested CPU share a cache, run the softirq on
* the current CPU. One might concern this is just like
* QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
* running in interrupt handler, and currently I/O controller doesn't
* support multiple interrupts, so current CPU is unique actually. This
* avoids IPI sending from current CPU to the first CPU of a group.
*/
if (ccpu == cpu || shared) {
struct list_head *list;
do_local:
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&req->ipi_list, list);
/*
* if the list only contains our just added request,
* signal a raise of the softirq. If there are already
* entries there, someone already raised the irq but it
* hasn't run yet.
*/
if (list->next == &req->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
} else if (raise_blk_irq(ccpu, req))
goto do_local;
local_irq_restore(flags);
}
static void __blk_mq_complete_request_remote(void *data)
struct request *rq = data;
/**
* blk_mq_force_complete_rq() - Force complete the request, bypassing any error
* injection that could drop the completion.
* @rq: Request to be force completed
*
* Drivers should use blk_mq_complete_request() to complete requests in their
* normal IO path. For timeout error recovery, drivers may call this forced
* completion routine after they've reclaimed timed out requests to bypass
* potentially subsequent fake timeouts.
*/
void blk_mq_force_complete_rq(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* Most of single queue controllers, there is only one irq vector
* for handling IO completion, and the only irq's affinity is set
* as all possible CPUs. On most of ARCHs, this affinity means the
* irq is handled on one specific CPU.
*
* So complete IO reqeust in softirq context in case of single queue
* for not degrading IO performance by irqsoff latency.
*/
__blk_complete_request(rq);
return;
}
/*
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
if ((rq->cmd_flags & REQ_HIPRI) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
shared = cpus_share_cache(cpu, ctx->cpu);
if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
Frederic Weisbecker
committed
smp_call_function_single_async(ctx->cpu, &rq->csd);
EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
{
if (!(hctx->flags & BLK_MQ_F_BLOCKING))
rcu_read_unlock();
else
srcu_read_unlock(hctx->srcu, srcu_idx);
}
static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
__acquires(hctx->srcu)
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
/* shut up gcc false positive */
*srcu_idx = 0;
*srcu_idx = srcu_read_lock(hctx->srcu);
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
bool blk_mq_complete_request(struct request *rq)
if (unlikely(blk_should_fake_timeout(rq->q)))
blk_mq_force_complete_rq(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
*
* Function used by device drivers to notify the block layer that a request
* is going to be processed now, so blk layer can do proper initializations
* such as starting the timeout timer.
*/
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
#endif
EXPORT_SYMBOL(blk_mq_start_request);
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
continue;
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
/*
* If RQF_DONTPREP, rq has contained some driver specific
* data, so insert it to hctx dispatch list to avoid any
* merge.
*/
if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, false, false);
else
blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_sched_insert_request(rq, false, false, false);
blk_mq_run_hw_queues(q, false);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list)
{
struct request_queue *q = rq->q;
unsigned long flags;
/*
* We abuse this flag that is otherwise used by the I/O scheduler to
* request head insertion from the workqueue.
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
rq->rq_flags |= RQF_SOFTBARRIER;
list_add(&rq->queuelist, &q->requeue_list);
} else {
list_add_tail(&rq->queuelist, &q->requeue_list);
}
spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
}
void blk_mq_kick_requeue_list(struct request_queue *q)
{
Bart Van Assche
committed
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
unsigned long msecs)
{
Bart Van Assche
committed
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag < tags->nr_tags) {
prefetch(tags->rqs[tag]);
return tags->rqs[tag];
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
* If we find a request that is inflight and the queue matches,
* we know the queue is busy. Return false to stop the iteration.
if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
bool *busy = priv;
*busy = true;
return false;
}
return true;
}
bool blk_mq_queue_inflight(struct request_queue *q)
{
bool busy = false;
blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
req->rq_flags |= RQF_TIMED_OUT;
if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;
ret = req->q->mq_ops->timeout(req, reserved);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
return false;
if (rq->rq_flags & RQF_TIMED_OUT)