summaryrefslogtreecommitdiff
path: root/block/blk-throttle.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--block/blk-throttle.c411
1 files changed, 274 insertions, 137 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d6dd2e047874..bd15357f23bd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios_bps);
+ bio_list_init(&qn->bios_iops);
qn->tg = tg;
}
@@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
* throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
* @bio: bio being added
* @qn: qnode to add bio to
- * @queued: the service_queue->queued[] list @qn belongs to
+ * @sq: the service_queue @qn belongs to
*
- * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
* @qn->tg's reference count is bumped when @qn is activated. See the
* comment on top of throtl_qnode definition for details.
*/
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
- struct list_head *queued)
+ struct throtl_service_queue *sq)
{
- bio_list_add(&qn->bios, bio);
+ bool rw = bio_data_dir(bio);
+
+ /*
+ * Split bios have already been throttled by bps, so they are
+ * directly queued into the iops path.
+ */
+ if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ bio_list_add(&qn->bios_iops, bio);
+ sq->nr_queued_iops[rw]++;
+ } else {
+ bio_list_add(&qn->bios_bps, bio);
+ sq->nr_queued_bps[rw]++;
+ }
+
if (list_empty(&qn->node)) {
- list_add_tail(&qn->node, queued);
+ list_add_tail(&qn->node, &sq->queued[rw]);
blkg_get(tg_to_blkg(qn->tg));
}
}
@@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
+ *
+ * Always take a bio from the head of the iops queue first. If the queue is
+ * empty, we then take it from the bps queue to maintain the overall idea of
+ * fetching bios from the head.
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = bio_list_peek(&qn->bios_iops);
+ if (!bio)
+ bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio);
return bio;
}
/**
* throtl_pop_queued - pop the first bio form a qnode list
- * @queued: the qnode list to pop a bio from
+ * @sq: the service_queue to pop a bio from
* @tg_to_put: optional out argument for throtl_grp to put
+ * @rw: read/write
*
- * Pop the first bio from the qnode list @queued. After popping, the first
- * qnode is removed from @queued if empty or moved to the end of @queued so
- * that the popping order is round-robin.
+ * Pop the first bio from the qnode list @sq->queued. Note that we firstly
+ * focus on the iops list because bios are ultimately dispatched from it.
+ * After popping, the first qnode is removed from @sq->queued if empty or moved
+ * to the end of @sq->queued so that the popping order is round-robin.
*
* When the first qnode is removed, its associated throtl_grp should be put
* too. If @tg_to_put is NULL, this function automatically puts it;
* otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
* responsible for putting it.
*/
-static struct bio *throtl_pop_queued(struct list_head *queued,
- struct throtl_grp **tg_to_put)
+static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
+ struct throtl_grp **tg_to_put, bool rw)
{
+ struct list_head *queued = &sq->queued[rw];
struct throtl_qnode *qn;
struct bio *bio;
@@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = bio_list_pop(&qn->bios_iops);
+ if (bio) {
+ sq->nr_queued_iops[rw]--;
+ } else {
+ bio = bio_list_pop(&qn->bios_bps);
+ if (bio)
+ sq->nr_queued_bps[rw]--;
+ }
WARN_ON_ONCE(!bio);
- if (bio_list_empty(&qn->bios)) {
+ if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
+ if (!time_before(tg->slice_end[rw], jiffy_end))
+ return;
+
throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
@@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
+static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
+{
+ return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
+}
+
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
@@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
+static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ long long bytes_trim;
+
+ if (bps_limit == U64_MAX)
+ return 0;
+
+ /* Need to consider the case of bytes_allowed overflow. */
+ bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
+ if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
+ bytes_trim = tg->bytes_disp[rw];
+ tg->bytes_disp[rw] = 0;
+ } else {
+ tg->bytes_disp[rw] -= bytes_trim;
+ }
+
+ return bytes_trim;
+}
+
+static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u32 iops_limit = tg_iops_limit(tg, rw);
+ int io_trim;
+
+ if (iops_limit == UINT_MAX)
+ return 0;
+
+ /* Need to consider the case of io_allowed overflow. */
+ io_trim = calculate_io_allowed(iops_limit, time_elapsed);
+ if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
+ io_trim = tg->io_disp[rw];
+ tg->io_disp[rw] = 0;
+ } else {
+ tg->io_disp[rw] -= io_trim;
+ }
+
+ return io_trim;
+}
+
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
@@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* one extra slice is preserved for deviation.
*/
time_elapsed -= tg->td->throtl_slice;
- bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
- time_elapsed);
- io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed);
- if (bytes_trim <= 0 && io_trim <= 0)
+ bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
+ io_trim = throtl_trim_iops(tg, rw, time_elapsed);
+ if (!bytes_trim && !io_trim)
return;
- if ((long long)tg->bytes_disp[rw] >= bytes_trim)
- tg->bytes_disp[rw] -= bytes_trim;
- else
- tg->bytes_disp[rw] = 0;
-
- if ((int)tg->io_disp[rw] >= io_trim)
- tg->io_disp[rw] -= io_trim;
- else
- tg->io_disp[rw] = 0;
-
tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
@@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
+ long long bytes_allowed;
+ int io_allowed;
+
+ /*
+ * If the queue is empty, carryover handling is not needed. In such cases,
+ * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
+ * of subsequent bios. The same handling applies when the previous BPS/IOPS
+ * limit was set to max.
+ */
+ if (sq_queued(&tg->service_queue, rw) == 0) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ return;
+ }
/*
* If config is updated while bios are still throttled, calculate and
- * accumulate how many bytes/ios are waited across changes. And
- * carryover_bytes/ios will be used to calculate new wait time under new
- * configuration.
+ * accumulate how many bytes/ios are waited across changes. And use the
+ * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which
+ * will be used to calculate new wait time under new configuration.
+ * And we need to consider the case of bytes/io_allowed overflow.
*/
- if (bps_limit != U64_MAX)
- *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
- tg->bytes_disp[rw];
- if (iops_limit != UINT_MAX)
- *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
- tg->io_disp[rw];
- tg->bytes_disp[rw] -= *bytes;
- tg->io_disp[rw] -= *ios;
+ if (bps_limit != U64_MAX) {
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
+ if (bytes_allowed > 0)
+ *bytes = bytes_allowed - tg->bytes_disp[rw];
+ }
+ if (iops_limit != UINT_MAX) {
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
+ if (io_allowed > 0)
+ *ios = io_allowed - tg->io_disp[rw];
+ }
+
+ tg->bytes_disp[rw] = -*bytes;
+ tg->io_disp[rw] = -*ios;
}
static void tg_update_carryover(struct throtl_grp *tg)
@@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg)
long long bytes[2] = {0};
int ios[2] = {0};
- if (tg->service_queue.nr_queued[READ])
- __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
- if (tg->service_queue.nr_queued[WRITE])
- __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
+ __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
+ __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
- /* see comments in struct throtl_grp for meaning of these fields. */
+ /* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
@@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
- if (iops_limit == UINT_MAX) {
- return 0;
- }
-
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
@@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- /* no need to throttle if this bio's bytes have been accounted */
- if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- return 0;
- }
-
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
- if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ /* Need to consider the case of bytes_allowed overflow. */
+ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ || bytes_allowed < 0)
return 0;
/* Calc approx time to dispatch */
@@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
return jiffy_wait;
}
+static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ unsigned int bio_size = throtl_bio_data_size(bio);
+
+ /* Charge the bio to the group */
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
+ bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->bytes_disp[bio_data_dir(bio)] += bio_size;
+ }
+}
+
+static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->io_disp[bio_data_dir(bio)]++;
+}
+
/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ * If previous slice expired, start a new one otherwise renew/extend existing
+ * slice to make sure it is at least throtl_slice interval long since now. New
+ * slice is started only for empty throttle group. If there is queued bio, that
+ * means there should be an active slice and it should be extended instead.
*/
-static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static void tg_update_slice(struct throtl_grp *tg, bool rw)
+{
+ if (throtl_slice_used(tg, rw) &&
+ sq_queued(&tg->service_queue, rw) == 0)
+ throtl_start_new_slice(tg, rw, true);
+ else
+ throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
+}
+
+static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
- unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
+ unsigned long bps_wait;
+
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
+ bio_flagged(bio, BIO_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_TG_BPS_THROTTLED))
+ return 0;
+
+ tg_update_slice(tg, rw);
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ throtl_extend_slice(tg, rw, jiffies + bps_wait);
+
+ return bps_wait;
+}
+
+static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
u32 iops_limit = tg_iops_limit(tg, rw);
+ unsigned long iops_wait;
+
+ if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
+ return 0;
+
+ tg_update_slice(tg, rw);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ throtl_extend_slice(tg, rw, jiffies + iops_wait);
+
+ return iops_wait;
+}
+
+/*
+ * Returns approx number of jiffies to wait before this bio is with-in IO rate
+ * and can be moved to other queue or dispatched.
+ */
+static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+ unsigned long wait;
/*
* Currently whole state machine of group depends on first bio
@@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->service_queue.nr_queued[rw] &&
+ BUG_ON(sq_queued(&tg->service_queue, rw) &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
- /* If tg->bps = -1, then BW is unlimited */
- if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
- tg->flags & THROTL_TG_CANCELING) {
- if (wait)
- *wait = 0;
- return true;
- }
+ wait = tg_dispatch_bps_time(tg, bio);
+ if (wait != 0)
+ return wait;
/*
- * If previous slice expired, start a new one otherwise renew/extend
- * existing slice to make sure it is at least throtl_slice interval
- * long since now. New slice is started only for empty throttle group.
- * If there is queued bio, that means there should be an active
- * slice and it should be extended instead.
+ * Charge bps here because @bio will be directly placed into the
+ * iops queue afterward.
*/
- if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
- throtl_start_new_slice(tg, rw, true);
- else {
- if (time_before(tg->slice_end[rw],
- jiffies + tg->td->throtl_slice))
- throtl_extend_slice(tg, rw,
- jiffies + tg->td->throtl_slice);
- }
-
- bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
- iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
- if (bps_wait + iops_wait == 0) {
- if (wait)
- *wait = 0;
- return true;
- }
-
- max_wait = max(bps_wait, iops_wait);
-
- if (wait)
- *wait = max_wait;
-
- if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(tg, rw, jiffies + max_wait);
-
- return false;
-}
-
-static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
-{
- bool rw = bio_data_dir(bio);
- unsigned int bio_size = throtl_bio_data_size(bio);
+ throtl_charge_bps_bio(tg, bio);
- /* Charge the bio to the group */
- if (!bio_flagged(bio, BIO_BPS_THROTTLED))
- tg->bytes_disp[rw] += bio_size;
-
- tg->io_disp[rw]++;
+ return tg_dispatch_iops_time(tg, bio);
}
/**
@@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
- if (!sq->nr_queued[rw])
+ if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
- throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+ throtl_qnode_add_bio(bio, qn, sq);
+
+ /*
+ * Since we have split the queues, when the iops queue is
+ * previously empty and a new @bio is added into the first @qn,
+ * we also need to update the @tg->disptime.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ bio == throtl_peek_queued(&sq->queued[rw]))
+ tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
- sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
}
static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+ unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
struct bio *bio;
bio = throtl_peek_queued(&sq->queued[READ]);
if (bio)
- tg_may_dispatch(tg, bio, &read_wait);
+ read_wait = tg_dispatch_time(tg, bio);
bio = throtl_peek_queued(&sq->queued[WRITE]);
if (bio)
- tg_may_dispatch(tg, bio, &write_wait);
+ write_wait = tg_dispatch_time(tg, bio);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
+ tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
@@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
- bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
- sq->nr_queued[rw]--;
+ bio = throtl_pop_queued(sq, &tg_to_put, rw);
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
- &parent_sq->queued[rw]);
+ parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
@@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
/* Try to dispatch 75% READS and 25% WRITES */
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, READ);
nr_reads++;
@@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
}
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
@@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
- if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
+ if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
@@ -1037,9 +1149,11 @@ again:
dispatched = false;
while (true) {
+ unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
+ unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
+
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
- sq->nr_queued[READ] + sq->nr_queued[WRITE],
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
ret = throtl_select_dispatch(sq);
if (ret) {
@@ -1061,7 +1175,8 @@ again:
if (parent_sq) {
/* @parent_sq is another throl_grp, propagate dispatch */
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
@@ -1101,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(&q->queue_lock);
@@ -1606,11 +1721,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
- /* throtl is FIFO - if bios are already queued, should queue */
- if (tg->service_queue.nr_queued[rw])
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * For a split bio, we need to specifically distinguish whether the
+ * iops queue is empty.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED))
+ return sq->nr_queued_iops[rw] == 0 &&
+ tg_dispatch_iops_time(tg, bio) == 0;
+
+ /*
+ * Throtl is FIFO - if bios are already queued, should queue.
+ * If the bps queue is empty and @bio is within the bps limit, charge
+ * bps here for direct placement into the iops queue.
+ */
+ if (sq_queued(&tg->service_queue, rw)) {
+ if (sq->nr_queued_bps[rw] == 0 &&
+ tg_dispatch_bps_time(tg, bio) == 0)
+ throtl_charge_bps_bio(tg, bio);
+
return false;
+ }
- return tg_may_dispatch(tg, bio, NULL);
+ return tg_dispatch_time(tg, bio) == 0;
}
bool __blk_throtl_bio(struct bio *bio)
@@ -1631,7 +1765,7 @@ bool __blk_throtl_bio(struct bio *bio)
while (true) {
if (tg_within_limit(tg, bio, rw)) {
/* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* We need to trim slice even when bios are not being
@@ -1654,7 +1788,8 @@ bool __blk_throtl_bio(struct bio *bio)
* control algorithm is adaptive, and extra IO bytes
* will be throttled for paying the debt
*/
- throtl_charge_bio(tg, bio);
+ throtl_charge_bps_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
@@ -1680,7 +1815,7 @@ bool __blk_throtl_bio(struct bio *bio)
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ sq_queued(sq, READ), sq_queued(sq, WRITE));
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@@ -1688,11 +1823,13 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* Update @tg's dispatch time and force schedule dispatch if @tg
- * was empty before @bio. The forced scheduling isn't likely to
- * cause undue delay as @bio is likely to be dispatched directly if
- * its @tg's disptime is not in the future.
+ * was empty before @bio, or the iops queue is empty and @bio will
+ * add to. The forced scheduling isn't likely to cause undue
+ * delay as @bio is likely to be dispatched directly if its @tg's
+ * disptime is not in the future.
*/
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}