Newer
Older
// SPDX-License-Identifier: GPL-2.0
/*
* Block multiqueue core code
*
* Copyright (C) 2013-2014 Jens Axboe
* Copyright (C) 2013-2014 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
Ingo Molnar
committed
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-pm.h"
#include "blk-mq-sched.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, sectors, bucket;
sectors = blk_rq_stats_sectors(rq);
bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
return bucket;
}
#define BLK_QC_T_SHIFT 16
#define BLK_QC_T_INTERNAL (1U << 31)
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
return xa_load(&q->hctx_table,
(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
blk_qc_t qc)
{
unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
if (qc & BLK_QC_T_INTERNAL)
return blk_mq_tag_to_rq(hctx->sched_tags, tag);
return blk_mq_tag_to_rq(hctx->tags, tag);
static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
(rq->tag != -1 ?
rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
}
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
if (!sbitmap_test_bit(&hctx->ctx_map, bit))
sbitmap_set_bit(&hctx->ctx_map, bit);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
sbitmap_clear_bit(&hctx->ctx_map, bit);
struct block_device *part;
unsigned int inflight[2];
static bool blk_mq_check_inflight(struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
if ((!mi->part->bd_partno || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part)
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
return mi.inflight[0] + mi.inflight[1];
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2])
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
inflight[0] = mi.inflight[0];
inflight[1] = mi.inflight[1];
void blk_freeze_queue_start(struct request_queue *q)
mutex_lock(&q->mq_freeze_lock);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
blk_mq_run_hw_queues(q, false);
} else {
mutex_unlock(&q->mq_freeze_lock);
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout)
{
return wait_event_timeout(q->mq_freeze_wq,
percpu_ref_is_zero(&q->q_usage_counter),
timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_freeze_queue(struct request_queue *q)
/*
* In the !blk_mq case we are only calling this to kill the
* q_usage_counter, otherwise this increases the freeze depth
* and waits for it to return to zero. For this reason there is
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_mq_freeze_queue_wait(q);
}
void blk_mq_freeze_queue(struct request_queue *q)
{
/*
* ...just an alias to keep freeze and unfreeze actions balanced
* in the blk_mq_* namespace
*/
blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
q->mq_freeze_depth--;
WARN_ON_ONCE(q->mq_freeze_depth < 0);
if (!q->mq_freeze_depth) {
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
mutex_unlock(&q->mq_freeze_lock);
void blk_mq_unfreeze_queue(struct request_queue *q)
{
__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->queue_lock, flags);
if (!q->quiesce_depth++)
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
* Note: it is driver's responsibility for making sure that quiesce has
* been started.
void blk_mq_wait_quiesce_done(struct request_queue *q)
if (blk_queue_has_srcu(q))
synchronize_srcu(q->srcu);
else
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
/**
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
* callback function is invoked. Once this function is returned, we make
* sure no dispatch can happen until the queue is unquiesced via
* blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
blk_mq_wait_quiesce_done(q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/*
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
* @q: request queue.
*
* This function recovers queue into the state before quiescing
* which is done by blk_mq_quiesce_queue.
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
unsigned long flags;
bool run_queue = false;
spin_lock_irqsave(&q->queue_lock, flags);
if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
;
} else if (!--q->quiesce_depth) {
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
run_queue = true;
}
spin_unlock_irqrestore(&q->queue_lock, flags);
/* dispatch requests which are inserted during quiescing */
if (run_queue)
blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
void blk_rq_init(struct request_queue *q, struct request *rq)
{
memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
struct blk_mq_ctx *ctx = data->ctx;
struct blk_mq_hw_ctx *hctx = data->hctx;
struct request_queue *q = data->q;
struct request *rq = tags->static_rqs[tag];
rq->q = q;
rq->mq_ctx = ctx;
rq->mq_hctx = hctx;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PM)
data->rq_flags |= RQF_PM;
if (blk_queue_io_stat(q))
data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
if (!(data->rq_flags & RQF_ELV)) {
rq->internal_tag = BLK_MQ_NO_TAG;
} else {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
rq->end_io = NULL;
rq->end_io_data = NULL;
blk_crypto_rq_set_defaults(rq);
INIT_LIST_HEAD(&rq->queuelist);
/* tag was already set */
WRITE_ONCE(rq->deadline, 0);
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = data->q->elevator;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
if (!op_is_flush(data->cmd_flags) &&
e->type->ops.prepare_request) {
e->type->ops.prepare_request(rq);
rq->rq_flags |= RQF_ELVPRIV;
}
}
static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
u64 alloc_time_ns)
{
unsigned int tag, tag_offset;
struct blk_mq_tags *tags;
tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
if (unlikely(!tag_mask))
tags = blk_mq_tags_from_data(data);
for (i = 0; tag_mask; i++) {
if (!(tag_mask & (1UL << i)))
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
rq_list_add(data->cached_rq, rq);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
return rq_list_pop(data->cached_rq);
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
struct request_queue *q = data->q;
struct request *rq;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (q->elevator) {
struct elevator_queue *e = q->elevator;
data->rq_flags |= RQF_ELV;
* Flush/passthrough requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
* limiting, as it isn't useful.
if (!op_is_flush(data->cmd_flags) &&
!blk_op_is_passthrough(data->cmd_flags) &&
e->type->ops.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.limit_depth(data->cmd_flags, data);
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!(data->rq_flags & RQF_ELV))
blk_mq_tag_busy(data->hctx);
/*
* Try batched alloc if we want more than 1 tag.
*/
if (data->nr_tags > 1) {
rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
if (rq)
return rq;
data->nr_tags = 1;
}
/*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
* should have migrated us to an online CPU by now.
*/
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
/*
* Give up the CPU and sleep for a random short time to
* ensure that thread using a realtime scheduling class
* are migrated off the CPU, and thus off the hctx that
* is going away.
*/
msleep(3);
goto retry;
}
return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
alloc_time_ns);
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
.nr_tags = 1,
struct request *rq;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = __blk_mq_alloc_requests(&data);
goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
.nr_tags = 1,
u64 alloc_time_ns = 0;
unsigned int cpu;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags);
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
blk_mq_tag_busy(data.hctx);
else
data.rq_flags |= RQF_ELV;
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
alloc_time_ns);
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if ((rq->rq_flags & RQF_ELVPRIV) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
if (rq->rq_flags & RQF_MQ_INFLIGHT)
__blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->disk->bdi);
if (req_ref_put_and_test(rq))
EXPORT_SYMBOL_GPL(blk_mq_free_request);
void blk_mq_free_plug_rqs(struct blk_plug *plug)
while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
blk_mq_free_request(rq);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
{
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
rq->q->disk ? rq->q->disk->disk_name : "?",
(unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
printk(KERN_INFO " bio %p, biotail %p, len %u\n",
rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);
static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, blk_status_t error)
{
} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
/*
* Partial zone append completions cannot be supported as the
* BIO fragments may end up not being written sequentially.
*/
if (bio->bi_iter.bi_size != nbytes)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_iter.bi_sector = rq->__sector;
}
bio_advance(bio, nbytes);
if (unlikely(rq->rq_flags & RQF_QUIET))
bio_set_flag(bio, BIO_QUIET);
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
part_stat_add(req->part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
static void blk_print_req_error(struct request *req, blk_status_t status)
{
printk_ratelimited(KERN_ERR
"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
"phys_seg %u prio class %u\n",
blk_status_to_str(status),
req->q->disk ? req->q->disk->disk_name : "?",
blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
req->cmd_flags & ~REQ_OP_MASK,
req->nr_phys_segments,
IOPRIO_PRIO_CLASS(req->ioprio));
}
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
/*
* Fully end IO on a request. Does not support partial completions, or
* errors.
*/
static void blk_complete_request(struct request *req)
{
const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
int total_bytes = blk_rq_bytes(req);
struct bio *bio = req->bio;
trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
if (!bio)
return;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
req->q->integrity.profile->complete_fn(req, total_bytes);
#endif
blk_account_io_completion(req, total_bytes);
do {
struct bio *next = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
if (req_op(req) == REQ_OP_ZONE_APPEND)
bio->bi_iter.bi_sector = req->__sector;
if (!is_flush)
bio_endio(bio);
bio = next;
} while (bio);
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
* later.
*/
req->bio = NULL;
req->__data_len = 0;
}
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
/**
* blk_update_request - Complete multiple bytes without completing the request
* @req: the request being processed
* @error: block status code
* @nr_bytes: number of bytes to complete for @req
*
* Description:
* Ends I/O on a number of bytes attached to @req, but doesn't complete
* the request structure even if @req doesn't have leftover.
* If @req has leftover, sets it up for the next range of segments.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
* Note:
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
* except in the consistency check at the end of this function.
*
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
**/
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
if (!req->bio)
return false;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
error == BLK_STS_OK)
req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
blk_account_io_completion(req, nr_bytes);
total_bytes = 0;
while (req->bio) {
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
if (!nr_bytes)
break;
}
/*
* completely done
*/
if (!req->bio) {
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
* later.
*/
req->__data_len = 0;
return false;
}
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */
if (!blk_rq_is_passthrough(req))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */
if (req->rq_flags & RQF_MIXED_MERGE) {
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
}
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
/*
* If total number of sectors is less than the first segment
* size, something has gone terribly wrong.
*/
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
/* recalculate the number of segments */
req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
static void __blk_account_io_done(struct request *req, u64 now)
{
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
part_stat_unlock();
}
static inline void blk_account_io_done(struct request *req, u64 now)
{
/*
* Account IO completion. flush_rq isn't accounted as a
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
if (blk_do_io_stat(req) && req->part &&
!(req->rq_flags & RQF_FLUSH_SEQ))
__blk_account_io_done(req, now);
}
static void __blk_account_io_start(struct request *rq)
{
/*
* All non-passthrough requests are created from a bio with one
* exception: when a flush command that is part of a flush sequence
* generated by the state machine in blk-flush.c is cloned onto the
* lower device by dm-multipath we can get here without a bio.
*/
if (rq->bio)
rq->part = rq->bio->bi_bdev;
rq->part = rq->q->disk->part0;
part_stat_lock();
update_io_ticks(rq->part, jiffies, false);
part_stat_unlock();
}
static inline void blk_account_io_start(struct request *req)
{
if (blk_do_io_stat(req))
__blk_account_io_start(req);
}
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq, now);
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_mq_need_time_stamp(rq))
__blk_mq_end_request_acct(rq, ktime_get_ns());
rq_qos_done(rq->q, rq);
blk_mq_free_request(rq);
EXPORT_SYMBOL(__blk_mq_end_request);
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_request(rq, error);
EXPORT_SYMBOL(blk_mq_end_request);
#define TAG_COMP_BATCH 32
static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
int *tag_array, int nr_tags)
{
struct request_queue *q = hctx->queue;
/*
* All requests should have been marked as RQF_MQ_INFLIGHT, so
* update hctx->nr_active in batch
*/
if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
__blk_mq_sub_active_requests(hctx, nr_tags);
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}
void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
int tags[TAG_COMP_BATCH], nr_tags = 0;
struct blk_mq_hw_ctx *cur_hctx = NULL;
struct request *rq;
u64 now = 0;
if (iob->need_ts)
now = ktime_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio);
prefetch(rq->rq_next);
if (iob->need_ts)
__blk_mq_end_request_acct(rq, now);
rq_qos_done(rq->q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (!req_ref_put_and_test(rq))
continue;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
if (cur_hctx)
blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
cur_hctx = rq->mq_hctx;
}
tags[nr_tags++] = rq->tag;
}
if (nr_tags)