
[BUG]
There is an issue of delayed IO dispatch caused by IO splitting. Consider the following scenario:
1) If we set a BPS limit of 1MB/s and restrict the maximum IO size per dispatch to 4KB, submitting two 1MB IO requests results in completion times of 1s and 2s, which is expected.
2) However, if we additionally set an IOPS limit of 1,000,000/s with the same BPS limit of 1MB/s, submitting two 1MB IO requests again results in both completing in 2s, even though the IOPS constraint is being met.

[CAUSE]
This issue arises because BPS and IOPS currently share the same queue in the blkthrotl mechanism:
1) The issue does not occur when only BPS is limited, because the split IOs return false in blk_should_throtl() and do not go through throttling again.
2) For split IOs, even if they have been tagged with BIO_BPS_THROTTLED, they still get queued alternately in the same list due to continuous splitting and reordering. As a result, the two IO requests both complete at the 2-second mark, causing an unintended delay.
3) It is not difficult to see that, in this scenario, if N 1MB IOs are issued at once, all of them will eventually complete together at the N-second mark.

[FIX]
With the queue separation introduced in the previous patches, we now have separate BPS and IOPS queues. IOs that have already passed the BPS limit do not need to re-enter the BPS queue and can be placed directly on the IOPS queue.

Since the queues are now split, when the IOPS queue was previously empty and a new bio is added to the first qnode->bios_iops list in the service_queue, the disptime also needs to be updated. This patch introduces the THROTL_TG_IOPS_WAS_EMPTY flag to mark this case.

Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Zizhi Wo <wozizhi@huaweicloud.com>
Link: https://lore.kernel.org/r/20250506020935.655574-8-wozizhi@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
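To make the dispatch-path change concrete, the fragment below is a minimal, hypothetical sketch of the enqueue decision described above. It is not the kernel's actual enqueue helper (that code lives in block/blk-throttle.c and is not part of this header); the function name example_throtl_add_bio() is made up for illustration, and the direct flag assignment is simplified. It only uses fields and flags declared in the header shown below.

/* Hypothetical sketch: route a bio to the bps or iops queue of a qnode. */
static void example_throtl_add_bio(struct throtl_grp *tg,
                                   struct throtl_qnode *qn, struct bio *bio)
{
        struct throtl_service_queue *sq = &tg->service_queue;
        int rw = bio_data_dir(bio);

        if (bio_flagged(bio, BIO_BPS_THROTTLED)) {
                /*
                 * The bio (e.g. a split of an already throttled bio) has
                 * already been charged against the bps budget, so it only
                 * has to wait for the iops budget. If the iops queue was
                 * empty, mark the group so its disptime is recalculated.
                 */
                if (!sq->nr_queued_iops[rw])
                        tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
                bio_list_add(&qn->bios_iops, bio);
                sq->nr_queued_iops[rw]++;
        } else {
                bio_list_add(&qn->bios_bps, bio);
                sq->nr_queued_bps[rw]++;
        }
}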
206 lines
6.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H

#include "blk-cgroup-rwstat.h"

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued. When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from. When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's. A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
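/*
 * For example, if child group A has many bios queued in its qnode while
 * child group B has only a few, dispatching in round-robin order
 * interleaves bios from A and B instead of draining all of A's bios
 * before B gets a turn.
 */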
struct throtl_qnode {
        struct list_head node; /* service_queue->queued[] */
        struct bio_list bios_bps; /* queued bios for bps limit */
        struct bio_list bios_iops; /* queued bios for iops limit */
        struct throtl_grp *tg; /* tg this qnode belongs to */
};

struct throtl_service_queue {
        struct throtl_service_queue *parent_sq; /* the parent service_queue */

        /*
         * Bios queued directly to this service_queue or dispatched from
         * children throtl_grp's.
         */
        struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
        unsigned int nr_queued_bps[2]; /* number of queued bps bios */
        unsigned int nr_queued_iops[2]; /* number of queued iops bios */

        /*
         * RB tree of active children throtl_grp's, which are sorted by
         * their ->disptime.
         */
        struct rb_root_cached pending_tree; /* RB tree of active tgs */
        unsigned int nr_pending; /* # queued in the tree */
        unsigned long first_pending_disptime; /* disptime of the first tg */
        struct timer_list pending_timer; /* fires on first_pending_disptime */
};

enum tg_state_flags {
        THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
        THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
        /*
         * The sq's iops queue is empty, and a bio is about to be enqueued
         * to the first qnode's bios_iops list.
         */
        THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
        THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};

struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;

        /* active throtl group service_queue member */
        struct rb_node rb_node;

        /* throtl_data this group belongs to */
        struct throtl_data *td;

        /* this group's service queue */
        struct throtl_service_queue service_queue;

        /*
         * qnode_on_self is used when bios are directly queued to this
         * throtl_grp so that local bios compete fairly with bios
         * dispatched from children. qnode_on_parent is used when bios are
         * dispatched from this throtl_grp into its parent and will compete
         * with the sibling qnode_on_parents and the parent's
         * qnode_on_self.
         */
        struct throtl_qnode qnode_on_self[2];
        struct throtl_qnode qnode_on_parent[2];

        /*
         * Dispatch time in jiffies. This is the estimated time when group
         * will unthrottle and is ready to dispatch more bio. It is used as
         * key to sort active groups in service tree.
         */
        unsigned long disptime;

        unsigned int flags;

        /* are there any throtl rules between this group and td? */
        bool has_rules_bps[2];
        bool has_rules_iops[2];

        /* bytes per second rate limits */
        uint64_t bps[2];

        /* IOPS limits */
        unsigned int iops[2];

        /*
         * Number of bytes/bio's dispatched in current slice.
         * When new configuration is submitted while some bios are still throttled,
         * first calculate the carryover: the amount of bytes/IOs already waited
         * under the previous configuration. Then, [bytes/io]_disp are represented
         * as the negative of the carryover, and they will be used to calculate the
         * wait time under the new configuration.
         */
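        /*
         * Worked example (illustrative): a throttled bio has waited long
         * enough under an old 1MB/s bps limit to cover 512KB. When the
         * limit is reconfigured, the carryover is 512KB and bytes_disp is
         * set to -512KB, so those already-waited bytes still count toward
         * the wait time computed under the new limit.
         */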
        int64_t bytes_disp[2];
        int io_disp[2];

        unsigned long last_check_time;

        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];

        struct blkg_rwstat stat_bytes;
        struct blkg_rwstat stat_ios;
};

extern struct blkcg_policy blkcg_policy_throtl;

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
        return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
        return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

/*
 * Internal throttling interface
 */
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
void blk_throtl_exit(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);

static inline bool blk_throtl_activated(struct request_queue *q)
{
        return q->td != NULL;
}

static inline bool blk_should_throtl(struct bio *bio)
{
        struct throtl_grp *tg;
        int rw = bio_data_dir(bio);

        /*
         * This is called under bio_queue_enter(), and it's synchronized with
         * the activation of blk-throtl, which is protected by
         * blk_mq_freeze_queue().
         */
        if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
                return false;

        tg = blkg_to_tg(bio->bi_blkg);
        if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
                if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                        bio_set_flag(bio, BIO_CGROUP_ACCT);
                        blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
                                        bio->bi_iter.bi_size);
                }
                blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
        }

        /* iops limit is always counted */
        if (tg->has_rules_iops[rw])
                return true;

        if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
                return true;

        return false;
}

static inline bool blk_throtl_bio(struct bio *bio)
{

        if (!blk_should_throtl(bio))
                return false;

        return __blk_throtl_bio(bio);
}
#endif /* CONFIG_BLK_DEV_THROTTLING */

#endif