Newer
Older
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
* blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
if (next != 0) {
mod_timer(&q->timeout, next);
/*
* Request timeouts are handled as a forward rolling timer. If
* we end up here it means that no requests are pending and
* also that no request has been pending for a while. Mark
* each hctx as idle.
*/
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
/* Argument bundle passed to flush_busy_ctx() via sbitmap_for_each_set(). */
struct flush_busy_ctx_data {
/* hardware queue whose ctxs[] array is indexed by the set bit number */
struct blk_mq_hw_ctx *hctx;
/* destination list that each software queue's requests are spliced onto */
struct list_head *list;
};
/*
 * sbitmap iteration callback used by blk_mq_flush_busy_ctxs().
 *
 * @sb:    bitmap being walked (hctx->ctx_map)
 * @bitnr: index of the set bit, which is also the index into hctx->ctxs[]
 * @data:  struct flush_busy_ctx_data describing the hctx and target list
 *
 * Under ctx->lock, moves every request queued on the software queue for this
 * hctx type to the tail of the caller's list and clears the busy bit.
 * Always returns true.
 */
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];

	spin_lock(&ctx->lock);
	/* Each ctx keeps one request list per hctx type; pick this hctx's. */
	list_splice_tail_init(&ctx->rq_lists[hctx->type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);

	return true;
}
/*
* Process software queues that have been marked busy, splicing them
* to the for-dispatch
*/
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
struct flush_busy_ctx_data data = {
.hctx = hctx,
.list = list,
};
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
/* In/out argument passed to dispatch_rq_from_ctx() via the sbitmap iterator. */
struct dispatch_rq_data {
/* in: hardware queue whose ctxs[] array is indexed by the set bit number */
struct blk_mq_hw_ctx *hctx;
/* out: request taken off a software queue, or NULL if none was found */
struct request *rq;
};
static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
void *data)
{
struct dispatch_rq_data *dispatch_data = data;
struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
if (!list_empty(&ctx->rq_lists[type])) {
dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
list_del_init(&dispatch_data->rq->queuelist);
sbitmap_clear_bit(sb, bitnr);
}
spin_unlock(&ctx->lock);
return !dispatch_data->rq;
}
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start)
{
unsigned off = start ? start->index_hw[hctx->type] : 0;
struct dispatch_rq_data data = {
.hctx = hctx,
.rq = NULL,
};
__sbitmap_for_each_set(&hctx->ctx_map, off,
dispatch_rq_from_ctx, &data);
return data.rq;
}
/*
 * Map a count of queued requests to a bucket index.
 *
 * Zero maps to bucket 0; any other count maps to ilog2(queued) + 1, capped
 * at BLK_MQ_MAX_DISPATCH_ORDER - 1.
 */
static inline unsigned int queued_to_index(unsigned int queued)
{
	if (!queued)
		return 0;

	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
/*
 * Try to allocate a driver tag for @rq from its hardware queue's tag set.
 *
 * Requests whose internal (scheduler) tag is reserved allocate from the
 * reserved bitmap; all others allocate from the regular bitmap, subject to
 * hctx_may_queue().
 *
 * Returns true and sets rq->tag on success, false if no tag is available.
 */
static bool __blk_mq_get_driver_tag(struct request *rq)
{
	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
	int tag;

	blk_mq_tag_busy(rq->mq_hctx);

	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
		bt = &rq->mq_hctx->tags->breserved_tags;
		/*
		 * Only bits from the regular bitmap are shifted past the
		 * reserved range; a bit from the reserved bitmap is already
		 * the final tag value.
		 */
		tag_offset = 0;
	} else {
		if (!hctx_may_queue(rq->mq_hctx, bt))
			return false;
	}

	tag = __sbitmap_queue_get(bt);
	if (tag == BLK_MQ_NO_TAG)
		return false;

	rq->tag = tag + tag_offset;
	return true;
}
bool blk_mq_get_driver_tag(struct request *rq)
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
return false;
if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
__blk_mq_inc_active_requests(hctx);
}
hctx->tags->rqs[rq->tag] = rq;
return true;
/*
 * Wait-queue callback for an hctx's dispatch_wait entry.
 *
 * Unhooks the entry from its wait queue if it is still linked, dropping the
 * bitmap's ws_active count to match, then kicks an asynchronous run of the
 * hardware queue. Always returns 1.
 */
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				int flags, void *key)
{
	struct blk_mq_hw_ctx *hctx =
		container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		list_del_init(&wait->entry);
		atomic_dec(&hctx->tags->bitmap_tags.ws_active);
	}
	spin_unlock(&hctx->dispatch_wait_lock);

	blk_mq_run_hw_queue(hctx, true);

	return 1;
}
/*
* Mark us waiting for a tag. For shared tags, this involves hooking us into
* the tag wakeups. For non-shared tags, we can simply mark us needing a
* restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
blk_mq_sched_mark_restart_hctx(hctx);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
wait = &hctx->dispatch_wait;
if (!list_empty_careful(&wait->entry))
return false;
wq = &bt_wait_ptr(sbq, hctx)->wait;
spin_lock_irq(&wq->lock);
spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return false;
atomic_inc(&sbq->ws_active);
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
__add_wait_queue(wq, wait);
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
if (!ret) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return false;
/*
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
list_del_init(&wait->entry);
atomic_dec(&sbq->ws_active);
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return true;
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Fold one dispatch outcome into hctx->dispatch_busy, an Exponential
 * Weighted Moving Average (EWMA):
 *
 *   new = (old * (WEIGHT - 1) + (busy ? 1 << FACTOR : 0)) / WEIGHT
 *
 * - the old value therefore carries weight 7/8 and the new sample 1/8, so
 *   the average decays exponentially;
 * - the sample is scaled by 1 << 4 so that the integer result does not
 *   collapse to 0 too early.
 *
 * Nothing is written when the average is already 0 and the queue was not
 * busy.
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int avg = hctx->dispatch_busy;

	if (avg == 0 && !busy)
		return;

	avg = avg * (BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1);
	if (busy)
		avg = avg + (1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR);

	hctx->dispatch_busy = avg / BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
}
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
static void blk_mq_handle_dev_resource(struct request *rq,
struct list_head *list)
{
struct request *next =
list_first_entry_or_null(list, struct request, queuelist);
/*
* If an I/O scheduler has been configured and we got a driver tag for
* the next request already, free it.
*/
if (next)
blk_mq_put_driver_tag(next);
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
}
/*
 * Park @rq on @zone_list for a later retry.
 *
 * We get here when a request cannot be dispatched to a specific zone because
 * of LLD-level zone-write locking, or because some other zone related
 * resource is unavailable.
 *
 * @rq:        request that could not be dispatched
 * @zone_list: side list of requests set aside for retrying
 */
static void blk_mq_handle_zone_resource(struct request *rq,
					struct list_head *zone_list)
{
	struct list_head *node = &rq->queuelist;

	list_add(node, zone_list);
	__blk_mq_requeue_request(rq);
}
/* Result of blk_mq_prep_dispatch_rq(). */
enum prep_dispatch {
/* budget (when requested) and driver tag were both obtained */
PREP_DISPATCH_OK,
/* no driver tag: blk_mq_get_driver_tag() and blk_mq_mark_tag_wait() both failed */
PREP_DISPATCH_NO_TAG,
/* blk_mq_get_dispatch_budget() returned a negative token */
PREP_DISPATCH_NO_BUDGET,
};
/*
 * Acquire what @rq needs before it can be handed to the driver.
 *
 * @rq:          request about to be dispatched
 * @need_budget: true if a dispatch budget must be obtained here
 *
 * Returns PREP_DISPATCH_OK when the request holds a driver tag (and a budget
 * token if one was requested), PREP_DISPATCH_NO_BUDGET when no budget could
 * be obtained, and PREP_DISPATCH_NO_TAG when no driver tag could be obtained.
 */
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
						  bool need_budget)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	int token = -1;

	if (need_budget) {
		token = blk_mq_get_dispatch_budget(rq->q);
		if (token < 0) {
			blk_mq_put_driver_tag(rq);
			return PREP_DISPATCH_NO_BUDGET;
		}
		blk_mq_set_rq_budget_token(rq, token);
	}

	/*
	 * If the initial allocation attempt fails we need to rerun the
	 * hardware queue when a tag is freed; the waitqueue set up by
	 * blk_mq_mark_tag_wait() takes care of that. If the queue is run
	 * before this entry is added back on the dispatch list, the caller
	 * re-runs it.
	 */
	if (blk_mq_get_driver_tag(rq) || blk_mq_mark_tag_wait(hctx, rq))
		return PREP_DISPATCH_OK;

	/*
	 * Only the budget taken by this function is returned here; budgets
	 * obtained elsewhere are put together while handling the partial
	 * dispatch.
	 */
	if (need_budget)
		blk_mq_put_dispatch_budget(rq->q, token);

	return PREP_DISPATCH_NO_TAG;
}
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
struct list_head *list)
struct request *rq;
list_for_each_entry(rq, list, queuelist) {
int budget_token = blk_mq_get_rq_budget_token(rq);
if (budget_token >= 0)
blk_mq_put_dispatch_budget(q, budget_token);
}
/*
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
unsigned int nr_budgets)
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
struct request *rq, *nxt;
int errors, queued;
if (list_empty(list))
return false;
/*
* Now process all the entries, sending them to the driver.
*/
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
if (prep != PREP_DISPATCH_OK)
list_del_init(&rq->queuelist);
/*
* Flag last if we have no more requests, or if we have more
* but can't assign a driver tag to it.
*/
if (list_empty(list))
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
bd.last = !blk_mq_get_driver_tag(nxt);
/*
* once the request is queued to lld, no need to cover the
* budget any more
*/
if (nr_budgets)
nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
queued++;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
break;
default:
blk_mq_end_request(rq, ret);
} while (!list_empty(list));
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
hctx->dispatched[queued_to_index(queued)]++;
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
q->mq_ops->commit_rqs(hctx);
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
/* For non-shared tags, the RESTART check will suffice */
bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
if (nr_budgets)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
* Order adding requests to hctx->dispatch and checking
* SCHED_RESTART flag. The pair of this smp_mb() is the one
* in blk_mq_sched_restart(). Avoid restart code path to
* miss the new added requests to hctx->dispatch, meantime
* SCHED_RESTART is observed here.
*/
smp_mb();
* If SCHED_RESTART was set by the caller of this function and
* it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed.
* If 'no_tag' is set, that means that we failed getting
* a driver tag with an I/O scheduler attached. If our dispatch
* waitqueue is no longer active, ensure that we run the queue
* AFTER adding our entries back to the list.
* If no I/O scheduler has been configured it is possible that
* the hardware queue got stopped and restarted before requests
* were pushed back onto the dispatch list. Rerun the queue to
* avoid starvation. Notes:
* - blk_mq_run_hw_queue() checks whether or not a queue has
* been stopped before rerunning a queue.
* - Some but not all block drivers stop a queue before
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
*
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
* that could otherwise occur if the queue is idle. We'll do
* similar if we couldn't get budget and SCHED_RESTART is set.
needs_restart = blk_mq_sched_needs_restart(hctx);
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && (ret == BLK_STS_RESOURCE ||
no_budget_avail))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
} else
blk_mq_update_dispatch_busy(hctx, false);
return (queued + errors) != 0;
/**
 * __blk_mq_run_hw_queue - Run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 *
 * Send pending requests to the hardware.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	int srcu_idx;

	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
	WARN_ON_ONCE(in_interrupt());

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	/* Dispatch between hctx_lock() and hctx_unlock(). */
	hctx_lock(hctx, &srcu_idx);
	blk_mq_sched_dispatch_requests(hctx);
	hctx_unlock(hctx, srcu_idx);
}
/*
 * Return the first CPU in hctx->cpumask that is also online; if there is no
 * such CPU, fall back to the first CPU in hctx->cpumask.
 */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int online = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (online < nr_cpu_ids)
		return online;

	return cpumask_first(hctx->cpumask);
}
/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
* BLK_MQ_CPU_WORK_BATCH queued items.
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
int next_cpu = hctx->next_cpu;
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
/*
* Do unbound schedule if we can't find a online CPU for this hctx,
* and it should only happen in the path of handling CPU DEAD.
*/
if (!cpu_online(next_cpu)) {
if (!tried) {
tried = true;
goto select_cpu;
}
/*
* Make sure to re-select CPU next time once after CPUs
* in hctx->cpumask become online again.
*/
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = 1;
return WORK_CPU_UNBOUND;
}
hctx->next_cpu = next_cpu;
return next_cpu;
}
/**
 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 * @msecs: Milliseconds of delay to wait before running the queue.
 *
 * If !@async, try to run the queue now. Else, run the queue asynchronously and
 * with a delay of @msecs.
 */
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
					unsigned long msecs)
{
	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
		int cpu = get_cpu();

		/* Run inline only when this CPU is in the hctx's cpumask. */
		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
			__blk_mq_run_hw_queue(hctx);
			put_cpu();
			return;
		}

		put_cpu();
	}

	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
				    msecs_to_jiffies(msecs));
}
/**
* blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
* @hctx: Pointer to the hardware queue to run.
* @msecs: Milliseconds of delay to wait before running the queue.
*
* Run a hardware queue asynchronously with a delay of @msecs.
*/
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Check if the request queue is not in a quiesced state and if there are
 * pending requests to be sent. If this is true, run the queue to send requests
 * to hardware.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	int srcu_idx;
	bool need_run;

	/*
	 * When queue is quiesced, we may be switching io scheduler, or
	 * updating nr_hw_queues, or other things, and we can't run queue
	 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
	 *
	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
	 * quiesced.
	 */
	hctx_lock(hctx, &srcu_idx);
	need_run = !blk_queue_quiesced(hctx->queue) &&
		blk_mq_hctx_has_pending(hctx);
	hctx_unlock(hctx, srcu_idx);

	/* Only run when not quiesced and there is something pending. */
	if (need_run)
		__blk_mq_delay_run_hw_queue(hctx, async, 0);
}
/*
 * Is the request queue handled by an IO scheduler that does not respect
 * hardware queues when dispatching?
 *
 * True only when an elevator is attached, it provides a dispatch_request
 * operation, and it does not advertise ELEVATOR_F_MQ_AWARE.
 */
static bool blk_mq_has_sqsched(struct request_queue *q)
{
	struct elevator_queue *sched = q->elevator;

	if (!sched)
		return false;
	if (!sched->type->ops.dispatch_request)
		return false;

	return !(sched->type->elevator_features & ELEVATOR_F_MQ_AWARE);
}
/*
 * Return the preferred queue to dispatch from (if any) for a non-mq aware IO
 * scheduler.
 *
 * If the IO scheduler does not respect hardware queues when dispatching, we
 * just don't bother with multiple HW queues and dispatch from the hctx for
 * the current CPU, since running multiple queues just causes lock contention
 * inside the scheduler and pointless cache bouncing.
 *
 * Returns that hctx, or NULL if it is stopped.
 */
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{
	struct blk_mq_hw_ctx *local_hctx;

	local_hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
					   raw_smp_processor_id());

	return blk_mq_hctx_stopped(local_hctx) ? NULL : local_hctx;
}
* blk_mq_run_hw_queues - Run all hardware queues in a request queue.
* @q: Pointer to the request queue to run.
* @async: If we want to run the queue asynchronously.
*/
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
struct blk_mq_hw_ctx *hctx, *sq_hctx;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
/*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
*/
if (!sq_hctx || sq_hctx == hctx ||
!list_empty_careful(&hctx->dispatch))
blk_mq_run_hw_queue(hctx, async);
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	int i;

	sq_hctx = NULL;
	if (blk_mq_has_sqsched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_delay_run_hw_queue(hctx, msecs);
	}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
/**
* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
* @q: request queue.
*
* The caller is responsible for serializing this function against
* blk_mq_{start,stop}_hw_queue().
*/
bool blk_mq_queue_stopped(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hctx_stopped(hctx))
return true;
return false;
}
EXPORT_SYMBOL(blk_mq_queue_stopped);
/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	/* Cancel any pending delayed run, then mark the hctx stopped. */
	cancel_delayed_work(&hctx->run_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
* BLK_STS_RESOURCE is usually returned.
*
* We do not guarantee that dispatch can be drained or blocked
* after blk_mq_stop_hw_queues() returns. Please use
* blk_mq_quiesce_queue() for that requirement.
*/
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
/*
 * Restart @hctx only if it is currently stopped: clear the stopped bit and
 * run the queue, asynchronously if @async is true. A queue that is not
 * stopped is left alone.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (blk_mq_hctx_stopped(hctx)) {
		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
/*
 * Call blk_mq_start_stopped_hw_queue() on every hardware queue of @q,
 * passing @async through.
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *each;
	int idx;

	queue_for_each_hw_ctx(q, each, idx) {
		blk_mq_start_stopped_hw_queue(each, async);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
/*
 * Work handler for hctx->run_work: runs the hardware queue that embeds
 * @work, unless that queue has been stopped.
 */
static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);

	/*
	 * If we are stopped, don't run the queue.
	 */
	if (blk_mq_hctx_stopped(hctx))
		return;

	__blk_mq_run_hw_queue(hctx);
}
/*
 * Link @rq into its software queue's request list for @hctx's type.
 *
 * @hctx:    hardware queue the request is mapped to
 * @rq:      request to insert
 * @at_head: insert at the head of the list if true, at the tail otherwise
 *
 * The caller must hold rq->mq_ctx->lock.
 */
static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
					    struct request *rq,
					    bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct list_head *rq_list = &ctx->rq_lists[hctx->type];

	lockdep_assert_held(&ctx->lock);

	trace_block_rq_insert(rq);

	/* Exactly one insertion: head and tail are mutually exclusive. */
	if (at_head)
		list_add(&rq->queuelist, rq_list);
	else
		list_add_tail(&rq->queuelist, rq_list);
}
/*
 * Insert @rq into its software queue and mark that software queue pending
 * on @hctx.
 *
 * @hctx:    hardware queue the request is mapped to
 * @rq:      request to insert
 * @at_head: insert at the head of the list if true, at the tail otherwise
 *
 * The caller must hold rq->mq_ctx->lock.
 */
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			     bool at_head)
{
	struct blk_mq_ctx *sw_ctx = rq->mq_ctx;

	lockdep_assert_held(&sw_ctx->lock);

	__blk_mq_insert_req_list(hctx, rq, at_head);
	blk_mq_hctx_mark_pending(hctx, sw_ctx);
}
/**
 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
 * @rq: Pointer to request to be inserted.
 * @at_head: true if the request should be inserted at the head of the list.
 * @run_queue: If we should run the hardware queue after inserting the request.
 *
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
				  bool run_queue)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	/* hctx->dispatch is modified under hctx->lock. */
	spin_lock(&hctx->lock);
	if (at_head)
		list_add(&rq->queuelist, &hctx->dispatch);
	else
		list_add_tail(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, false);
}
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list)
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
list_for_each_entry(rq, list, queuelist) {
trace_block_rq_insert(rq);
spin_lock(&ctx->lock);
list_splice_tail_init(list, &ctx->rq_lists[type]);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
}
/*
 * list_sort() comparison callback for plugged requests.
 *
 * Orders by software queue pointer first, then by hardware queue pointer,
 * then by starting position (blk_rq_pos()). Returns 1 when @a sorts after
 * @b under that ordering and 0 otherwise.
 */
static int plug_rq_cmp(void *priv, const struct list_head *a,
		       const struct list_head *b)
{
	struct request *lhs = container_of(a, struct request, queuelist);
	struct request *rhs = container_of(b, struct request, queuelist);

	if (lhs->mq_ctx == rhs->mq_ctx) {
		if (lhs->mq_hctx == rhs->mq_hctx)
			return blk_rq_pos(lhs) > blk_rq_pos(rhs);

		return lhs->mq_hctx > rhs->mq_hctx;
	}

	return lhs->mq_ctx > rhs->mq_ctx;
}
/*
 * Hand the requests collected on @plug over for insertion, one batch per
 * (hardware queue, software queue) pair.
 *
 * @plug:          plug whose mq_list is drained
 * @from_schedule: forwarded (negated) to the unplug tracepoint and passed to
 *                 blk_mq_sched_insert_requests()
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	LIST_HEAD(list);

	if (list_empty(&plug->mq_list))
		return;
	list_splice_init(&plug->mq_list, &list);

	/* Sorting groups requests of the same ctx/hctx next to each other. */
	if (plug->rq_count > 2 && plug->multiple_queues)
		list_sort(NULL, &list, plug_rq_cmp);

	plug->rq_count = 0;

	do {
		struct list_head rq_list;
		struct request *rq, *head_rq = list_entry_rq(list.next);
		struct list_head *pos = &head_rq->queuelist; /* skip first */
		struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
		struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
		unsigned int depth = 1;

		/* Extend the batch while requests share head_rq's queues. */
		list_for_each_continue(pos, &list) {
			rq = list_entry_rq(pos);
			BUG_ON(!rq->q);
			if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
				break;
			depth++;
		}

		list_cut_before(&rq_list, &list, pos);
		trace_block_unplug(head_rq->q, depth, !from_schedule);
		/*
		 * NOTE(review): the final argument was cut off in this copy of
		 * the file; from_schedule is assumed - confirm against the
		 * blk_mq_sched_insert_requests() prototype.
		 */
		blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
						from_schedule);
	} while(!list_empty(&list));
}
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
unsigned int nr_segs)
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);