Newer
Older
// SPDX-License-Identifier: GPL-2.0
/*
* Block multiqueue core code
*
* Copyright (C) 2013-2014 Jens Axboe
* Copyright (C) 2013-2014 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
Ingo Molnar
committed
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-pm.h"
#include "blk-mq-sched.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, sectors, bucket;
sectors = blk_rq_stats_sectors(rq);
bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
return bucket;
}
#define BLK_QC_T_SHIFT 16
#define BLK_QC_T_INTERNAL (1U << 31)
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
}
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
blk_qc_t qc)
{
unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
if (qc & BLK_QC_T_INTERNAL)
return blk_mq_tag_to_rq(hctx->sched_tags, tag);
return blk_mq_tag_to_rq(hctx->tags, tag);
static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
(rq->tag != -1 ?
rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
}
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
if (!sbitmap_test_bit(&hctx->ctx_map, bit))
sbitmap_set_bit(&hctx->ctx_map, bit);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
sbitmap_clear_bit(&hctx->ctx_map, bit);
struct block_device *part;
unsigned int inflight[2];
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
if ((!mi->part->bd_partno || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part)
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
return mi.inflight[0] + mi.inflight[1];
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2])
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
inflight[0] = mi.inflight[0];
inflight[1] = mi.inflight[1];
void blk_freeze_queue_start(struct request_queue *q)
mutex_lock(&q->mq_freeze_lock);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
blk_mq_run_hw_queues(q, false);
} else {
mutex_unlock(&q->mq_freeze_lock);
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout)
{
return wait_event_timeout(q->mq_freeze_wq,
percpu_ref_is_zero(&q->q_usage_counter),
timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_freeze_queue(struct request_queue *q)
/*
* In the !blk_mq case we are only calling this to kill the
* q_usage_counter, otherwise this increases the freeze depth
* and waits for it to return to zero. For this reason there is
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_mq_freeze_queue_wait(q);
}
void blk_mq_freeze_queue(struct request_queue *q)
{
/*
* ...just an alias to keep freeze and unfreeze actions balanced
* in the blk_mq_* namespace
*/
blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
q->mq_freeze_depth--;
WARN_ON_ONCE(q->mq_freeze_depth < 0);
if (!q->mq_freeze_depth) {
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
mutex_unlock(&q->mq_freeze_lock);
void blk_mq_unfreeze_queue(struct request_queue *q)
{
__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->queue_lock, flags);
if (!q->quiesce_depth++)
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
* Note: it is driver's responsibility for making sure that quiesce has
* been started.
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
bool rcu = false;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
synchronize_srcu(hctx->srcu);
else
rcu = true;
}
if (rcu)
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
/**
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
* callback function is invoked. Once this function is returned, we make
* sure no dispatch can happen until the queue is unquiesced via
* blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
blk_mq_wait_quiesce_done(q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/*
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
* @q: request queue.
*
* This function recovers queue into the state before quiescing
* which is done by blk_mq_quiesce_queue.
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
unsigned long flags;
bool run_queue = false;
spin_lock_irqsave(&q->queue_lock, flags);
if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
;
} else if (!--q->quiesce_depth) {
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
run_queue = true;
}
spin_unlock_irqrestore(&q->queue_lock, flags);
/* dispatch requests which are inserted during quiescing */
if (run_queue)
blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
void blk_rq_init(struct request_queue *q, struct request *rq)
{
memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
struct blk_mq_ctx *ctx = data->ctx;
struct blk_mq_hw_ctx *hctx = data->hctx;
struct request_queue *q = data->q;
struct request *rq = tags->static_rqs[tag];
rq->q = q;
rq->mq_ctx = ctx;
rq->mq_hctx = hctx;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PM)
data->rq_flags |= RQF_PM;
if (blk_queue_io_stat(q))
data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
if (!(data->rq_flags & RQF_ELV)) {
rq->internal_tag = BLK_MQ_NO_TAG;
} else {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
rq->end_io = NULL;
rq->end_io_data = NULL;
blk_crypto_rq_set_defaults(rq);
INIT_LIST_HEAD(&rq->queuelist);
/* tag was already set */
WRITE_ONCE(rq->deadline, 0);
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = data->q->elevator;
rq->elv.icq = NULL;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
if (!op_is_flush(data->cmd_flags) &&
e->type->ops.prepare_request) {
if (e->type->icq_cache)
blk_mq_sched_assign_ioc(rq);
e->type->ops.prepare_request(rq);
rq->rq_flags |= RQF_ELVPRIV;
}
}
static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
u64 alloc_time_ns)
{
unsigned int tag, tag_offset;
struct blk_mq_tags *tags;
tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
if (unlikely(!tag_mask))
tags = blk_mq_tags_from_data(data);
for (i = 0; tag_mask; i++) {
if (!(tag_mask & (1UL << i)))
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
rq_list_add(data->cached_rq, rq);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
return rq_list_pop(data->cached_rq);
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
struct request_queue *q = data->q;
struct request *rq;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (q->elevator) {
struct elevator_queue *e = q->elevator;
data->rq_flags |= RQF_ELV;
* Flush/passthrough requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
* limiting, as it isn't useful.
if (!op_is_flush(data->cmd_flags) &&
!blk_op_is_passthrough(data->cmd_flags) &&
e->type->ops.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.limit_depth(data->cmd_flags, data);
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!(data->rq_flags & RQF_ELV))
blk_mq_tag_busy(data->hctx);
/*
* Try batched alloc if we want more than 1 tag.
*/
if (data->nr_tags > 1) {
rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
if (rq)
return rq;
data->nr_tags = 1;
}
/*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
* should have migrated us to an online CPU by now.
*/
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
/*
* Give up the CPU and sleep for a random short time to
* ensure that thread using a realtime scheduling class
* are migrated off the CPU, and thus off the hctx that
* is going away.
*/
msleep(3);
goto retry;
}
return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
alloc_time_ns);
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
.nr_tags = 1,
struct request *rq;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = __blk_mq_alloc_requests(&data);
goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
.nr_tags = 1,
u64 alloc_time_ns = 0;
unsigned int cpu;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags);
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
blk_mq_tag_busy(data.hctx);
else
data.rq_flags |= RQF_ELV;
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
alloc_time_ns);
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->rq_flags & RQF_ELVPRIV) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.finish_request)
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
if (rq->rq_flags & RQF_MQ_INFLIGHT)
__blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->disk->bdi);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
EXPORT_SYMBOL_GPL(blk_mq_free_request);
void blk_mq_free_plug_rqs(struct blk_plug *plug)
while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
blk_mq_free_request(rq);
}
static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, blk_status_t error)
{
} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
/*
* Partial zone append completions cannot be supported as the
* BIO fragments may end up not being written sequentially.
*/
if (bio->bi_iter.bi_size != nbytes)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_iter.bi_sector = rq->__sector;
}
bio_advance(bio, nbytes);
if (unlikely(rq->rq_flags & RQF_QUIET))
bio_set_flag(bio, BIO_QUIET);
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
part_stat_add(req->part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
/**
* blk_update_request - Complete multiple bytes without completing the request
* @req: the request being processed
* @error: block status code
* @nr_bytes: number of bytes to complete for @req
*
* Description:
* Ends I/O on a number of bytes attached to @req, but doesn't complete
* the request structure even if @req doesn't have leftover.
* If @req has leftover, sets it up for the next range of segments.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
* Note:
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
* except in the consistency check at the end of this function.
*
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
**/
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
if (!req->bio)
return false;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
error == BLK_STS_OK)
req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)))
blk_print_req_error(req, error);
blk_account_io_completion(req, nr_bytes);
total_bytes = 0;
while (req->bio) {
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
if (!nr_bytes)
break;
}
/*
* completely done
*/
if (!req->bio) {
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
* later.
*/
req->__data_len = 0;
return false;
}
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */
if (!blk_rq_is_passthrough(req))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */
if (req->rq_flags & RQF_MIXED_MERGE) {
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
}
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
/*
* If total number of sectors is less than the first segment
* size, something has gone terribly wrong.
*/
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
/* recalculate the number of segments */
req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq, now);
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_mq_need_time_stamp(rq))
__blk_mq_end_request_acct(rq, ktime_get_ns());
rq_qos_done(rq->q, rq);
blk_mq_free_request(rq);
EXPORT_SYMBOL(__blk_mq_end_request);
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_request(rq, error);
EXPORT_SYMBOL(blk_mq_end_request);
#define TAG_COMP_BATCH 32
static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
int *tag_array, int nr_tags)
{
struct request_queue *q = hctx->queue;
/*
* All requests should have been marked as RQF_MQ_INFLIGHT, so
* update hctx->nr_active in batch
*/
if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
__blk_mq_sub_active_requests(hctx, nr_tags);
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}
void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
int tags[TAG_COMP_BATCH], nr_tags = 0;
struct blk_mq_hw_ctx *cur_hctx = NULL;
struct request *rq;
u64 now = 0;
if (iob->need_ts)
now = ktime_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio);
prefetch(rq->rq_next);
blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
if (iob->need_ts)
__blk_mq_end_request_acct(rq, now);
rq_qos_done(rq->q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (!refcount_dec_and_test(&rq->ref))
continue;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
if (cur_hctx)
blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
cur_hctx = rq->mq_hctx;
}
tags[nr_tags++] = rq->tag;
}
if (nr_tags)
blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
static void blk_complete_reqs(struct llist_head *list)
struct llist_node *entry = llist_reverse_order(llist_del_all(list));
struct request *rq, *next;
llist_for_each_entry_safe(rq, next, entry, ipi_list)
rq->q->mq_ops->complete(rq);
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
static int blk_softirq_cpu_dead(unsigned int cpu)
{
blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
static void __blk_mq_complete_request_remote(void *data)
__raise_softirq_irqoff(BLOCK_SOFTIRQ);
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
int cpu = raw_smp_processor_id();
if (!IS_ENABLED(CONFIG_SMP) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
return false;
/*
* With force threaded interrupts enabled, raising softirq from an SMP
* function call will always result in waking the ksoftirqd thread.
* This is probably worse than completing the request on a different
* cache domain.
*/
return false;
/* same CPU or cache domain? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
return cpu_online(rq->mq_ctx->cpu);
}
static void blk_mq_complete_send_ipi(struct request *rq)
{
struct llist_head *list;
unsigned int cpu;
cpu = rq->mq_ctx->cpu;
list = &per_cpu(blk_cpu_done, cpu);
if (llist_add(&rq->ipi_list, list)) {
INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
smp_call_function_single_async(cpu, &rq->csd);
}
}
static void blk_mq_raise_softirq(struct request *rq)
{
struct llist_head *list;
preempt_disable();
list = this_cpu_ptr(&blk_cpu_done);
if (llist_add(&rq->ipi_list, list))
raise_softirq(BLOCK_SOFTIRQ);
preempt_enable();
}
bool blk_mq_complete_request_remote(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
if (rq->cmd_flags & REQ_POLLED)
if (blk_mq_complete_need_ipi(rq)) {
blk_mq_complete_send_ipi(rq);
return true;
if (rq->q->nr_hw_queues == 1) {
blk_mq_raise_softirq(rq);
return true;
}
return false;