blk-mq.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, sectors, bucket;

	ddir = rq_data_dir(rq);
	sectors = blk_rq_stats_sectors(rq);

	bucket = ddir + 2 * ilog2(sectors);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}

#define BLK_QC_T_SHIFT		16
#define BLK_QC_T_INTERNAL	(1U << 31)

static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
		blk_qc_t qc)
{
	return xa_load(&q->hctx_table,
			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
}

static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
		blk_qc_t qc)
{
	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);

	if (qc & BLK_QC_T_INTERNAL)
		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
	return blk_mq_tag_to_rq(hctx->tags, tag);
}

static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
		(rq->tag != -1 ?
		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
}

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_inflight(struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	if ((!mi->part->bd_partno || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
		struct block_device *part)
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
		unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
	inflight[0] = mi.inflight[0];
	inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	mutex_unlock(&q->mq_freeze_lock);
}

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @q: request queue.
 *
 * Note: it is driver's responsibility for making sure that quiesce has
 * been started.
 */
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
	if (blk_queue_has_srcu(q))
		synchronize_srcu(q->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent that the struct request end_io()
 * callback function is invoked. Once this function is returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);
	blk_mq_wait_quiesce_done(q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers queue into the state before quiescing
 * which is done by blk_mq_quiesce_queue.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;
	bool run_queue = false;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
		;
	} else if (!--q->quiesce_depth) {
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
		run_queue = true;
	}
	spin_unlock_irqrestore(&q->queue_lock, flags);

	/* dispatch requests which are inserted during quiescing */
	if (run_queue)
		blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = BLK_MQ_NO_TAG;
	rq->internal_tag = BLK_MQ_NO_TAG;
	rq->start_time_ns = ktime_get_ns();
	rq->part = NULL;
	blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
{
	struct blk_mq_ctx *ctx = data->ctx;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	struct request_queue *q = data->q;
	struct request *rq = tags->static_rqs[tag];

	rq->q = q;
	rq->mq_ctx = ctx;
	rq->mq_hctx = hctx;
	rq->cmd_flags = data->cmd_flags;

	if (data->flags & BLK_MQ_REQ_PM)
		data->rq_flags |= RQF_PM;
	if (blk_queue_io_stat(q))
		data->rq_flags |= RQF_IO_STAT;
	rq->rq_flags = data->rq_flags;

	if (!(data->rq_flags & RQF_ELV)) {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	} else {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	}
	rq->timeout = 0;

	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	rq->alloc_time_ns = alloc_time_ns;
#endif
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->end_io = NULL;
	rq->end_io_data = NULL;

	blk_crypto_rq_set_defaults(rq);
	INIT_LIST_HEAD(&rq->queuelist);
	/* tag was already set */
	WRITE_ONCE(rq->deadline, 0);
	req_ref_set(rq, 1);

	if (rq->rq_flags & RQF_ELV) {
		struct elevator_queue *e = data->q->elevator;

		INIT_HLIST_NODE(&rq->hash);
		RB_CLEAR_NODE(&rq->rb_node);

		if (!op_is_flush(data->cmd_flags) &&
		    e->type->ops.prepare_request) {
			e->type->ops.prepare_request(rq);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}

	return rq;
}

static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
		u64 alloc_time_ns)
{
	unsigned int tag, tag_offset;
	struct blk_mq_tags *tags;
	struct request *rq;
	unsigned long tag_mask;
	int i, nr = 0;

	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
	if (unlikely(!tag_mask))
		return NULL;

	tags = blk_mq_tags_from_data(data);
	for (i = 0; tag_mask; i++) {
		if (!(tag_mask & (1UL << i)))
			continue;
		tag = tag_offset + i;
		prefetch(tags->static_rqs[tag]);
		tag_mask &= ~(1UL << i);
		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
		rq_list_add(data->cached_rq, rq);
		nr++;
	}
	/* caller already holds a reference, add for remainder */
	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
	data->nr_tags -= nr;

	return rq_list_pop(data->cached_rq);
}

static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
	struct request_queue *q = data->q;
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int tag;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (q->elevator) {
		struct elevator_queue *e = q->elevator;

		data->rq_flags |= RQF_ELV;

		/*
		 * Flush/passthrough requests are special and go directly to the
		 * dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    !blk_op_is_passthrough(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	}

retry:
	data->ctx = blk_mq_get_ctx(q);
	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
	if (!(data->rq_flags & RQF_ELV))
		blk_mq_tag_busy(data->hctx);

	/*
	 * Try batched alloc if we want more than 1 tag.
	 */
	if (data->nr_tags > 1) {
		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
		if (rq)
			return rq;
		data->nr_tags = 1;
	}

	/*
	 * Waiting allocations only fail because of an inactive hctx.  In that
	 * case just retry the hctx assignment and tag allocation as CPU hotplug
	 * should have migrated us to an online CPU by now.
	 */
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
			return NULL;
		/*
		 * Give up the CPU and sleep for a random short time to
		 * ensure that thread using a realtime scheduling class
		 * are migrated off the CPU, and thus off the hctx that
		 * is going away.
		 */
		msleep(3);
		goto retry;
	}

	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
					alloc_time_ns);
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	rq = __blk_mq_alloc_requests(&data);
	if (!rq)
		goto out_queue_exit;
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	u64 alloc_time_ns = 0;
	unsigned int cpu;
	unsigned int tag;
	int ret;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	ret = -EXDEV;
	data.hctx = xa_load(&q->hctx_table, hctx_idx);
	if (!blk_mq_hw_queue_mapped(data.hctx))
		goto out_queue_exit;
	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
	data.ctx = __blk_mq_get_ctx(q, cpu);

	if (!q->elevator)
		blk_mq_tag_busy(data.hctx);
	else
		data.rq_flags |= RQF_ELV;

	ret = -EWOULDBLOCK;
	tag = blk_mq_get_tag(&data);
	if (tag == BLK_MQ_NO_TAG)
		goto out_queue_exit;
	return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
					alloc_time_ns);

out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_crypto_free_request(rq);
	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	if (sched_tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if ((rq->rq_flags & RQF_ELVPRIV) &&
	    q->elevator->type->ops.finish_request)
		q->elevator->type->ops.finish_request(rq);

	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		__blk_mq_dec_active_requests(hctx);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->disk->bdi);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (req_ref_put_and_test(rq))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
	struct request *rq;

	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
		blk_mq_free_request(rq);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
		rq->q->disk ? rq->q->disk->disk_name : "?",
		(unsigned long long) rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
	       rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, blk_status_t error)
{
	if (unlikely(error)) {
		bio->bi_status = error;
	} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
		/*
		 * Partial zone append completions cannot be supported as the
		 * BIO fragments may end up not being written sequentially.
		 */
		if (bio->bi_iter.bi_size != nbytes)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_iter.bi_sector = rq->__sector;
	}

	bio_advance(bio, nbytes);

	if (unlikely(rq->rq_flags & RQF_QUIET))
		bio_set_flag(bio, BIO_QUIET);
	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
		bio_endio(bio);
}

static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
	if (req->part && blk_do_io_stat(req)) {
		const int sgrp = op_stat_group(req_op(req));

		part_stat_lock();
		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
		part_stat_unlock();
	}
}

static void blk_print_req_error(struct request *req, blk_status_t status)
{
	printk_ratelimited(KERN_ERR
		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
		"phys_seg %u prio class %u\n",
		blk_status_to_str(status),
		req->q->disk ? req->q->disk->disk_name : "?",
		blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
		req->cmd_flags & ~REQ_OP_MASK,
		req->nr_phys_segments,
		IOPRIO_PRIO_CLASS(req->ioprio));
}

/*
 * Fully end IO on a request. Does not support partial completions, or
 * errors.
 */
static void blk_complete_request(struct request *req)
{
	const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
	int total_bytes = blk_rq_bytes(req);
	struct bio *bio = req->bio;

	trace_block_rq_complete(req, BLK_STS_OK, total_bytes);

	if (!bio)
		return;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
		req->q->integrity.profile->complete_fn(req, total_bytes);
#endif

	blk_account_io_completion(req, total_bytes);

	do {
		struct bio *next = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);

		if (req_op(req) == REQ_OP_ZONE_APPEND)
			bio->bi_iter.bi_sector = req->__sector;

		if (!is_flush)
			bio_endio(bio);
		bio = next;
	} while (bio);

	/*
	 * Reset counters so that the request stacking driver
	 * can find how many bytes remain in the request
	 * later.
	 */
	req->bio = NULL;
	req->__data_len = 0;
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *      except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
		unsigned int nr_bytes)
{
	int total_bytes;

	trace_block_rq_complete(req, error, nr_bytes);

	if (!req->bio)
		return false;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
	    error == BLK_STS_OK)
		req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif

	if (unlikely(error && !blk_rq_is_passthrough(req) &&
		     !(req->rq_flags & RQF_QUIET))) {
		blk_print_req_error(req, error);
		trace_block_rq_error(req, error, nr_bytes);
	}

	blk_account_io_completion(req, nr_bytes);

	total_bytes = 0;
	while (req->bio) {
		struct bio *bio = req->bio;
		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

		if (bio_bytes == bio->bi_iter.bi_size)
			req->bio = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
		req_bio_endio(req, bio, bio_bytes, error);

		total_bytes += bio_bytes;
		nr_bytes -= bio_bytes;

		if (!nr_bytes)
			break;
	}

	/*
	 * completely done
	 */
	if (!req->bio) {
		/*
		 * Reset counters so that the request stacking driver
		 * can find how many bytes remain in the request
		 * later.
		 */
		req->__data_len = 0;
		return false;
	}

	req->__data_len -= total_bytes;

	/* update sector only for requests with clear definition of sector */
	if (!blk_rq_is_passthrough(req))
		req->__sector += total_bytes >> 9;

	/* mixed attributes always follow the first bio */
	if (req->rq_flags & RQF_MIXED_MERGE) {
		req->cmd_flags &= ~REQ_FAILFAST_MASK;
		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
	}

	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
		/*
		 * If total number of sectors is less than the first segment
		 * size, something has gone terribly wrong.
		 */
		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
			blk_dump_rq_flags(req, "request botched");
			req->__data_len = blk_rq_cur_bytes(req);
		}

		/* recalculate the number of segments */
		req->nr_phys_segments = blk_recalc_rq_segments(req);
	}

	return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);

static void __blk_account_io_done(struct request *req, u64 now)
{
	const int sgrp = op_stat_group(req_op(req));

	part_stat_lock();
	update_io_ticks(req->part, jiffies, true);
	part_stat_inc(req->part, ios[sgrp]);
	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
	part_stat_unlock();
}

static inline void blk_account_io_done(struct request *req, u64 now)
{
	/*
	 * Account IO completion.  flush_rq isn't accounted as a
	 * normal IO on queueing nor completion.  Accounting the
	 * containing request is enough.
	 */
	if (blk_do_io_stat(req) && req->part &&
	    !(req->rq_flags & RQF_FLUSH_SEQ))
		__blk_account_io_done(req, now);
}

static void __blk_account_io_start(struct request *rq)
{
	/*
	 * All non-passthrough requests are created from a bio with one
	 * exception: when a flush command that is part of a flush sequence
	 * generated by the state machine in blk-flush.c is cloned onto the
	 * lower device by dm-multipath we can get here without a bio.
	 */
	if (rq->bio)
		rq->part = rq->bio->bi_bdev;
	else
		rq->part = rq->q->disk->part0;

	part_stat_lock();
	update_io_ticks(rq->part, jiffies, false);
	part_stat_unlock();
}

static inline void blk_account_io_start(struct request *req)
{
	if (blk_do_io_stat(req))
		__blk_account_io_start(req);
}

static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq, now);
	}

	blk_mq_sched_completed_request(rq, now);
	blk_account_io_done(rq, now);
}

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_mq_need_time_stamp(rq))
		__blk_mq_end_request_acct(rq, ktime_get_ns());

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

#define TAG_COMP_BATCH		32

static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
					  int *tag_array, int nr_tags)
{
	struct request_queue *q = hctx->queue;

	/*
	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
	 * update hctx->nr_active in batch
	 */
	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		__blk_mq_sub_active_requests(hctx, nr_tags);

	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}

void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
	int tags[TAG_COMP_BATCH], nr_tags = 0;
	struct blk_mq_hw_ctx *cur_hctx = NULL;
	struct request *rq;
	u64 now = 0;

	if (iob->need_ts)
		now = ktime_get_ns();

	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
		prefetch(rq->bio);
		prefetch(rq->rq_next);

		blk_complete_request(rq);
		if (iob->need_ts)
			__blk_mq_end_request_acct(rq, now);

		rq_qos_done(rq->q, rq);

		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		if (!req_ref_put_and_test(rq))
			continue;

		blk_crypto_free_request(rq);
		blk_pm_mark_last_busy(rq);

		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
			if (cur_hctx)
				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
			nr_tags = 0;
			cur_hctx = rq->mq_hctx;
		}
		tags[nr_tags++] = rq->tag;
	}

	if (nr_tags)