mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson
synced 2025-09-01 15:14:52 +00:00
sched_ext: Move built-in idle CPU selection policy to a separate file
As ext.c is becoming quite large, move the idle CPU selection policy to separate files (ext_idle.c / ext_idle.h) for better code readability. Moreover, group together all the idle CPU selection kfunc's to the same btf_kfunc_id_set block. No functional changes, this is purely code reorganization. Suggested-by: Yury Norov <yury.norov@gmail.com> Signed-off-by: Andrea Righi <arighi@nvidia.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
parent
1626e5ef0b
commit
337d1b354a
@ -21006,8 +21006,7 @@ S: Maintained
|
|||||||
W: https://github.com/sched-ext/scx
|
W: https://github.com/sched-ext/scx
|
||||||
T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
|
T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
|
||||||
F: include/linux/sched/ext.h
|
F: include/linux/sched/ext.h
|
||||||
F: kernel/sched/ext.h
|
F: kernel/sched/ext*
|
||||||
F: kernel/sched/ext.c
|
|
||||||
F: tools/sched_ext/
|
F: tools/sched_ext/
|
||||||
F: tools/testing/selftests/sched_ext
|
F: tools/testing/selftests/sched_ext
|
||||||
|
|
||||||
|
@ -61,6 +61,7 @@
|
|||||||
|
|
||||||
#ifdef CONFIG_SCHED_CLASS_EXT
|
#ifdef CONFIG_SCHED_CLASS_EXT
|
||||||
# include "ext.c"
|
# include "ext.c"
|
||||||
|
# include "ext_idle.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "syscalls.c"
|
#include "syscalls.c"
|
||||||
|
@ -6,6 +6,9 @@
|
|||||||
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
||||||
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
||||||
*/
|
*/
|
||||||
|
#include <linux/btf_ids.h>
|
||||||
|
#include "ext_idle.h"
|
||||||
|
|
||||||
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
|
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
|
||||||
|
|
||||||
enum scx_consts {
|
enum scx_consts {
|
||||||
@ -883,12 +886,6 @@ static bool scx_warned_zero_slice;
|
|||||||
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
|
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
|
||||||
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
|
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
|
||||||
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
|
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
|
||||||
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
|
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
|
||||||
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
|
|
||||||
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static struct static_key_false scx_has_op[SCX_OPI_END] =
|
static struct static_key_false scx_has_op[SCX_OPI_END] =
|
||||||
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
|
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
|
||||||
@ -923,21 +920,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
|
|||||||
|
|
||||||
static struct delayed_work scx_watchdog_work;
|
static struct delayed_work scx_watchdog_work;
|
||||||
|
|
||||||
/* idle tracking */
|
|
||||||
#ifdef CONFIG_SMP
|
|
||||||
#ifdef CONFIG_CPUMASK_OFFSTACK
|
|
||||||
#define CL_ALIGNED_IF_ONSTACK
|
|
||||||
#else
|
|
||||||
#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static struct {
|
|
||||||
cpumask_var_t cpu;
|
|
||||||
cpumask_var_t smt;
|
|
||||||
} idle_masks CL_ALIGNED_IF_ONSTACK;
|
|
||||||
|
|
||||||
#endif /* CONFIG_SMP */
|
|
||||||
|
|
||||||
/* for %SCX_KICK_WAIT */
|
/* for %SCX_KICK_WAIT */
|
||||||
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
|
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
|
||||||
|
|
||||||
@ -3175,416 +3157,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
|
|||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
|
|
||||||
static bool test_and_clear_cpu_idle(int cpu)
|
|
||||||
{
|
|
||||||
#ifdef CONFIG_SCHED_SMT
|
|
||||||
/*
|
|
||||||
* SMT mask should be cleared whether we can claim @cpu or not. The SMT
|
|
||||||
* cluster is not wholly idle either way. This also prevents
|
|
||||||
* scx_pick_idle_cpu() from getting caught in an infinite loop.
|
|
||||||
*/
|
|
||||||
if (sched_smt_active()) {
|
|
||||||
const struct cpumask *smt = cpu_smt_mask(cpu);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If offline, @cpu is not its own sibling and
|
|
||||||
* scx_pick_idle_cpu() can get caught in an infinite loop as
|
|
||||||
* @cpu is never cleared from idle_masks.smt. Ensure that @cpu
|
|
||||||
* is eventually cleared.
|
|
||||||
*
|
|
||||||
* NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
|
|
||||||
* reduce memory writes, which may help alleviate cache
|
|
||||||
* coherence pressure.
|
|
||||||
*/
|
|
||||||
if (cpumask_intersects(smt, idle_masks.smt))
|
|
||||||
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
|
|
||||||
else if (cpumask_test_cpu(cpu, idle_masks.smt))
|
|
||||||
__cpumask_clear_cpu(cpu, idle_masks.smt);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
|
|
||||||
}
|
|
||||||
|
|
||||||
static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
|
|
||||||
{
|
|
||||||
int cpu;
|
|
||||||
|
|
||||||
retry:
|
|
||||||
if (sched_smt_active()) {
|
|
||||||
cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
|
|
||||||
if (cpu < nr_cpu_ids)
|
|
||||||
goto found;
|
|
||||||
|
|
||||||
if (flags & SCX_PICK_IDLE_CORE)
|
|
||||||
return -EBUSY;
|
|
||||||
}
|
|
||||||
|
|
||||||
cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
|
|
||||||
if (cpu >= nr_cpu_ids)
|
|
||||||
return -EBUSY;
|
|
||||||
|
|
||||||
found:
|
|
||||||
if (test_and_clear_cpu_idle(cpu))
|
|
||||||
return cpu;
|
|
||||||
else
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
|
|
||||||
* domain is not defined).
|
|
||||||
*/
|
|
||||||
static unsigned int llc_weight(s32 cpu)
|
|
||||||
{
|
|
||||||
struct sched_domain *sd;
|
|
||||||
|
|
||||||
sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
||||||
if (!sd)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return sd->span_weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
|
|
||||||
* domain is not defined).
|
|
||||||
*/
|
|
||||||
static struct cpumask *llc_span(s32 cpu)
|
|
||||||
{
|
|
||||||
struct sched_domain *sd;
|
|
||||||
|
|
||||||
sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
||||||
if (!sd)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return sched_domain_span(sd);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
|
|
||||||
* NUMA domain is not defined).
|
|
||||||
*/
|
|
||||||
static unsigned int numa_weight(s32 cpu)
|
|
||||||
{
|
|
||||||
struct sched_domain *sd;
|
|
||||||
struct sched_group *sg;
|
|
||||||
|
|
||||||
sd = rcu_dereference(per_cpu(sd_numa, cpu));
|
|
||||||
if (!sd)
|
|
||||||
return 0;
|
|
||||||
sg = sd->groups;
|
|
||||||
if (!sg)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return sg->group_weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
|
|
||||||
* domain is not defined).
|
|
||||||
*/
|
|
||||||
static struct cpumask *numa_span(s32 cpu)
|
|
||||||
{
|
|
||||||
struct sched_domain *sd;
|
|
||||||
struct sched_group *sg;
|
|
||||||
|
|
||||||
sd = rcu_dereference(per_cpu(sd_numa, cpu));
|
|
||||||
if (!sd)
|
|
||||||
return NULL;
|
|
||||||
sg = sd->groups;
|
|
||||||
if (!sg)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
return sched_group_span(sg);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return true if the LLC domains do not perfectly overlap with the NUMA
|
|
||||||
* domains, false otherwise.
|
|
||||||
*/
|
|
||||||
static bool llc_numa_mismatch(void)
|
|
||||||
{
|
|
||||||
int cpu;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We need to scan all online CPUs to verify whether their scheduling
|
|
||||||
* domains overlap.
|
|
||||||
*
|
|
||||||
* While it is rare to encounter architectures with asymmetric NUMA
|
|
||||||
* topologies, CPU hotplugging or virtualized environments can result
|
|
||||||
* in asymmetric configurations.
|
|
||||||
*
|
|
||||||
* For example:
|
|
||||||
*
|
|
||||||
* NUMA 0:
|
|
||||||
* - LLC 0: cpu0..cpu7
|
|
||||||
* - LLC 1: cpu8..cpu15 [offline]
|
|
||||||
*
|
|
||||||
* NUMA 1:
|
|
||||||
* - LLC 0: cpu16..cpu23
|
|
||||||
* - LLC 1: cpu24..cpu31
|
|
||||||
*
|
|
||||||
* In this case, if we only check the first online CPU (cpu0), we might
|
|
||||||
* incorrectly assume that the LLC and NUMA domains are fully
|
|
||||||
* overlapping, which is incorrect (as NUMA 1 has two distinct LLC
|
|
||||||
* domains).
|
|
||||||
*/
|
|
||||||
for_each_online_cpu(cpu)
|
|
||||||
if (llc_weight(cpu) != numa_weight(cpu))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Initialize topology-aware scheduling.
|
|
||||||
*
|
|
||||||
* Detect if the system has multiple LLC or multiple NUMA domains and enable
|
|
||||||
* cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
|
|
||||||
* selection policy.
|
|
||||||
*
|
|
||||||
* Assumption: the kernel's internal topology representation assumes that each
|
|
||||||
* CPU belongs to a single LLC domain, and that each LLC domain is entirely
|
|
||||||
* contained within a single NUMA node.
|
|
||||||
*/
|
|
||||||
static void update_selcpu_topology(void)
|
|
||||||
{
|
|
||||||
bool enable_llc = false, enable_numa = false;
|
|
||||||
unsigned int nr_cpus;
|
|
||||||
s32 cpu = cpumask_first(cpu_online_mask);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Enable LLC domain optimization only when there are multiple LLC
|
|
||||||
* domains among the online CPUs. If all online CPUs are part of a
|
|
||||||
* single LLC domain, the idle CPU selection logic can choose any
|
|
||||||
* online CPU without bias.
|
|
||||||
*
|
|
||||||
* Note that it is sufficient to check the LLC domain of the first
|
|
||||||
* online CPU to determine whether a single LLC domain includes all
|
|
||||||
* CPUs.
|
|
||||||
*/
|
|
||||||
rcu_read_lock();
|
|
||||||
nr_cpus = llc_weight(cpu);
|
|
||||||
if (nr_cpus > 0) {
|
|
||||||
if (nr_cpus < num_online_cpus())
|
|
||||||
enable_llc = true;
|
|
||||||
pr_debug("sched_ext: LLC=%*pb weight=%u\n",
|
|
||||||
cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Enable NUMA optimization only when there are multiple NUMA domains
|
|
||||||
* among the online CPUs and the NUMA domains don't perfectly overlaps
|
|
||||||
* with the LLC domains.
|
|
||||||
*
|
|
||||||
* If all CPUs belong to the same NUMA node and the same LLC domain,
|
|
||||||
* enabling both NUMA and LLC optimizations is unnecessary, as checking
|
|
||||||
* for an idle CPU in the same domain twice is redundant.
|
|
||||||
*/
|
|
||||||
nr_cpus = numa_weight(cpu);
|
|
||||||
if (nr_cpus > 0) {
|
|
||||||
if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
|
|
||||||
enable_numa = true;
|
|
||||||
pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
|
|
||||||
cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
|
|
||||||
}
|
|
||||||
rcu_read_unlock();
|
|
||||||
|
|
||||||
pr_debug("sched_ext: LLC idle selection %s\n",
|
|
||||||
str_enabled_disabled(enable_llc));
|
|
||||||
pr_debug("sched_ext: NUMA idle selection %s\n",
|
|
||||||
str_enabled_disabled(enable_numa));
|
|
||||||
|
|
||||||
if (enable_llc)
|
|
||||||
static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
|
|
||||||
else
|
|
||||||
static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
|
|
||||||
if (enable_numa)
|
|
||||||
static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
|
|
||||||
else
|
|
||||||
static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Built-in CPU idle selection policy:
|
|
||||||
*
|
|
||||||
* 1. Prioritize full-idle cores:
|
|
||||||
* - always prioritize CPUs from fully idle cores (both logical CPUs are
|
|
||||||
* idle) to avoid interference caused by SMT.
|
|
||||||
*
|
|
||||||
* 2. Reuse the same CPU:
|
|
||||||
* - prefer the last used CPU to take advantage of cached data (L1, L2) and
|
|
||||||
* branch prediction optimizations.
|
|
||||||
*
|
|
||||||
* 3. Pick a CPU within the same LLC (Last-Level Cache):
|
|
||||||
* - if the above conditions aren't met, pick a CPU that shares the same LLC
|
|
||||||
* to maintain cache locality.
|
|
||||||
*
|
|
||||||
* 4. Pick a CPU within the same NUMA node, if enabled:
|
|
||||||
* - choose a CPU from the same NUMA node to reduce memory access latency.
|
|
||||||
*
|
|
||||||
* 5. Pick any idle CPU usable by the task.
|
|
||||||
*
|
|
||||||
* Step 3 and 4 are performed only if the system has, respectively, multiple
|
|
||||||
* LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
|
|
||||||
* scx_selcpu_topo_numa).
|
|
||||||
*
|
|
||||||
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
|
|
||||||
* we never call ops.select_cpu() for them, see select_task_rq().
|
|
||||||
*/
|
|
||||||
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
|
|
||||||
u64 wake_flags, bool *found)
|
|
||||||
{
|
|
||||||
const struct cpumask *llc_cpus = NULL;
|
|
||||||
const struct cpumask *numa_cpus = NULL;
|
|
||||||
s32 cpu;
|
|
||||||
|
|
||||||
*found = false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This is necessary to protect llc_cpus.
|
|
||||||
*/
|
|
||||||
rcu_read_lock();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Determine the scheduling domain only if the task is allowed to run
|
|
||||||
* on all CPUs.
|
|
||||||
*
|
|
||||||
* This is done primarily for efficiency, as it avoids the overhead of
|
|
||||||
* updating a cpumask every time we need to select an idle CPU (which
|
|
||||||
* can be costly in large SMP systems), but it also aligns logically:
|
|
||||||
* if a task's scheduling domain is restricted by user-space (through
|
|
||||||
* CPU affinity), the task will simply use the flat scheduling domain
|
|
||||||
* defined by user-space.
|
|
||||||
*/
|
|
||||||
if (p->nr_cpus_allowed >= num_possible_cpus()) {
|
|
||||||
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
|
|
||||||
numa_cpus = numa_span(prev_cpu);
|
|
||||||
|
|
||||||
if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
|
|
||||||
llc_cpus = llc_span(prev_cpu);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
|
|
||||||
*/
|
|
||||||
if (wake_flags & SCX_WAKE_SYNC) {
|
|
||||||
cpu = smp_processor_id();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the waker's CPU is cache affine and prev_cpu is idle,
|
|
||||||
* then avoid a migration.
|
|
||||||
*/
|
|
||||||
if (cpus_share_cache(cpu, prev_cpu) &&
|
|
||||||
test_and_clear_cpu_idle(prev_cpu)) {
|
|
||||||
cpu = prev_cpu;
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the waker's local DSQ is empty, and the system is under
|
|
||||||
* utilized, try to wake up @p to the local DSQ of the waker.
|
|
||||||
*
|
|
||||||
* Checking only for an empty local DSQ is insufficient as it
|
|
||||||
* could give the wakee an unfair advantage when the system is
|
|
||||||
* oversaturated.
|
|
||||||
*
|
|
||||||
* Checking only for the presence of idle CPUs is also
|
|
||||||
* insufficient as the local DSQ of the waker could have tasks
|
|
||||||
* piled up on it even if there is an idle core elsewhere on
|
|
||||||
* the system.
|
|
||||||
*/
|
|
||||||
if (!cpumask_empty(idle_masks.cpu) &&
|
|
||||||
!(current->flags & PF_EXITING) &&
|
|
||||||
cpu_rq(cpu)->scx.local_dsq.nr == 0) {
|
|
||||||
if (cpumask_test_cpu(cpu, p->cpus_ptr))
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If CPU has SMT, any wholly idle CPU is likely a better pick than
|
|
||||||
* partially idle @prev_cpu.
|
|
||||||
*/
|
|
||||||
if (sched_smt_active()) {
|
|
||||||
/*
|
|
||||||
* Keep using @prev_cpu if it's part of a fully idle core.
|
|
||||||
*/
|
|
||||||
if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
|
|
||||||
test_and_clear_cpu_idle(prev_cpu)) {
|
|
||||||
cpu = prev_cpu;
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any fully idle core in the same LLC domain.
|
|
||||||
*/
|
|
||||||
if (llc_cpus) {
|
|
||||||
cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any fully idle core in the same NUMA node.
|
|
||||||
*/
|
|
||||||
if (numa_cpus) {
|
|
||||||
cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any full idle core usable by the task.
|
|
||||||
*/
|
|
||||||
cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Use @prev_cpu if it's idle.
|
|
||||||
*/
|
|
||||||
if (test_and_clear_cpu_idle(prev_cpu)) {
|
|
||||||
cpu = prev_cpu;
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any idle CPU in the same LLC domain.
|
|
||||||
*/
|
|
||||||
if (llc_cpus) {
|
|
||||||
cpu = scx_pick_idle_cpu(llc_cpus, 0);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any idle CPU in the same NUMA node.
|
|
||||||
*/
|
|
||||||
if (numa_cpus) {
|
|
||||||
cpu = scx_pick_idle_cpu(numa_cpus, 0);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search for any idle CPU usable by the task.
|
|
||||||
*/
|
|
||||||
cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
|
|
||||||
if (cpu >= 0)
|
|
||||||
goto cpu_found;
|
|
||||||
|
|
||||||
rcu_read_unlock();
|
|
||||||
return prev_cpu;
|
|
||||||
|
|
||||||
cpu_found:
|
|
||||||
rcu_read_unlock();
|
|
||||||
|
|
||||||
*found = true;
|
|
||||||
return cpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
|
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@ -3651,90 +3223,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
|
|||||||
(struct cpumask *)p->cpus_ptr);
|
(struct cpumask *)p->cpus_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void reset_idle_masks(void)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Consider all online cpus idle. Should converge to the actual state
|
|
||||||
* quickly.
|
|
||||||
*/
|
|
||||||
cpumask_copy(idle_masks.cpu, cpu_online_mask);
|
|
||||||
cpumask_copy(idle_masks.smt, cpu_online_mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void update_builtin_idle(int cpu, bool idle)
|
|
||||||
{
|
|
||||||
assign_cpu(cpu, idle_masks.cpu, idle);
|
|
||||||
|
|
||||||
#ifdef CONFIG_SCHED_SMT
|
|
||||||
if (sched_smt_active()) {
|
|
||||||
const struct cpumask *smt = cpu_smt_mask(cpu);
|
|
||||||
|
|
||||||
if (idle) {
|
|
||||||
/*
|
|
||||||
* idle_masks.smt handling is racy but that's fine as
|
|
||||||
* it's only for optimization and self-correcting.
|
|
||||||
*/
|
|
||||||
if (!cpumask_subset(smt, idle_masks.cpu))
|
|
||||||
return;
|
|
||||||
cpumask_or(idle_masks.smt, idle_masks.smt, smt);
|
|
||||||
} else {
|
|
||||||
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Update the idle state of a CPU to @idle.
|
|
||||||
*
|
|
||||||
* If @do_notify is true, ops.update_idle() is invoked to notify the scx
|
|
||||||
* scheduler of an actual idle state transition (idle to busy or vice
|
|
||||||
* versa). If @do_notify is false, only the idle state in the idle masks is
|
|
||||||
* refreshed without invoking ops.update_idle().
|
|
||||||
*
|
|
||||||
* This distinction is necessary, because an idle CPU can be "reserved" and
|
|
||||||
* awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
|
|
||||||
* busy even if no tasks are dispatched. In this case, the CPU may return
|
|
||||||
* to idle without a true state transition. Refreshing the idle masks
|
|
||||||
* without invoking ops.update_idle() ensures accurate idle state tracking
|
|
||||||
* while avoiding unnecessary updates and maintaining balanced state
|
|
||||||
* transitions.
|
|
||||||
*/
|
|
||||||
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
|
|
||||||
{
|
|
||||||
int cpu = cpu_of(rq);
|
|
||||||
|
|
||||||
lockdep_assert_rq_held(rq);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Trigger ops.update_idle() only when transitioning from a task to
|
|
||||||
* the idle thread and vice versa.
|
|
||||||
*
|
|
||||||
* Idle transitions are indicated by do_notify being set to true,
|
|
||||||
* managed by put_prev_task_idle()/set_next_task_idle().
|
|
||||||
*/
|
|
||||||
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
|
|
||||||
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Update the idle masks:
|
|
||||||
* - for real idle transitions (do_notify == true)
|
|
||||||
* - for idle-to-idle transitions (indicated by the previous task
|
|
||||||
* being the idle thread, managed by pick_task_idle())
|
|
||||||
*
|
|
||||||
* Skip updating idle masks if the previous task is not the idle
|
|
||||||
* thread, since set_next_task_idle() has already handled it when
|
|
||||||
* transitioning from a task to the idle thread (calling this
|
|
||||||
* function with do_notify == true).
|
|
||||||
*
|
|
||||||
* In this way we can avoid updating the idle masks twice,
|
|
||||||
* unnecessarily.
|
|
||||||
*/
|
|
||||||
if (static_branch_likely(&scx_builtin_idle_enabled))
|
|
||||||
if (do_notify || is_idle_task(rq->curr))
|
|
||||||
update_builtin_idle(cpu, idle);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void handle_hotplug(struct rq *rq, bool online)
|
static void handle_hotplug(struct rq *rq, bool online)
|
||||||
{
|
{
|
||||||
int cpu = cpu_of(rq);
|
int cpu = cpu_of(rq);
|
||||||
@ -3742,7 +3230,7 @@ static void handle_hotplug(struct rq *rq, bool online)
|
|||||||
atomic_long_inc(&scx_hotplug_seq);
|
atomic_long_inc(&scx_hotplug_seq);
|
||||||
|
|
||||||
if (scx_enabled())
|
if (scx_enabled())
|
||||||
update_selcpu_topology();
|
scx_idle_update_selcpu_topology();
|
||||||
|
|
||||||
if (online && SCX_HAS_OP(cpu_online))
|
if (online && SCX_HAS_OP(cpu_online))
|
||||||
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
|
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
|
||||||
@ -3774,12 +3262,6 @@ static void rq_offline_scx(struct rq *rq)
|
|||||||
rq->scx.flags &= ~SCX_RQ_ONLINE;
|
rq->scx.flags &= ~SCX_RQ_ONLINE;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* CONFIG_SMP */
|
|
||||||
|
|
||||||
static bool test_and_clear_cpu_idle(int cpu) { return false; }
|
|
||||||
static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
|
|
||||||
static void reset_idle_masks(void) {}
|
|
||||||
|
|
||||||
#endif /* CONFIG_SMP */
|
#endif /* CONFIG_SMP */
|
||||||
|
|
||||||
static bool check_rq_for_timeouts(struct rq *rq)
|
static bool check_rq_for_timeouts(struct rq *rq)
|
||||||
@ -5615,9 +5097,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||||||
static_branch_enable_cpuslocked(&scx_has_op[i]);
|
static_branch_enable_cpuslocked(&scx_has_op[i]);
|
||||||
|
|
||||||
check_hotplug_seq(ops);
|
check_hotplug_seq(ops);
|
||||||
#ifdef CONFIG_SMP
|
scx_idle_update_selcpu_topology();
|
||||||
update_selcpu_topology();
|
|
||||||
#endif
|
|
||||||
cpus_read_unlock();
|
cpus_read_unlock();
|
||||||
|
|
||||||
ret = validate_ops(ops);
|
ret = validate_ops(ops);
|
||||||
@ -5665,7 +5146,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||||||
static_branch_enable(&scx_ops_cpu_preempt);
|
static_branch_enable(&scx_ops_cpu_preempt);
|
||||||
|
|
||||||
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
|
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
|
||||||
reset_idle_masks();
|
scx_idle_reset_masks();
|
||||||
static_branch_enable(&scx_builtin_idle_enabled);
|
static_branch_enable(&scx_builtin_idle_enabled);
|
||||||
} else {
|
} else {
|
||||||
static_branch_disable(&scx_builtin_idle_enabled);
|
static_branch_disable(&scx_builtin_idle_enabled);
|
||||||
@ -6308,10 +5789,8 @@ void __init init_sched_ext_class(void)
|
|||||||
SCX_TG_ONLINE);
|
SCX_TG_ONLINE);
|
||||||
|
|
||||||
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
|
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
|
||||||
#ifdef CONFIG_SMP
|
scx_idle_init_masks();
|
||||||
BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
|
|
||||||
BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
|
|
||||||
#endif
|
|
||||||
scx_kick_cpus_pnt_seqs =
|
scx_kick_cpus_pnt_seqs =
|
||||||
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
|
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
|
||||||
__alignof__(scx_kick_cpus_pnt_seqs[0]));
|
__alignof__(scx_kick_cpus_pnt_seqs[0]));
|
||||||
@ -6344,62 +5823,6 @@ void __init init_sched_ext_class(void)
|
|||||||
/********************************************************************************
|
/********************************************************************************
|
||||||
* Helpers that can be called from the BPF scheduler.
|
* Helpers that can be called from the BPF scheduler.
|
||||||
*/
|
*/
|
||||||
#include <linux/btf_ids.h>
|
|
||||||
|
|
||||||
__bpf_kfunc_start_defs();
|
|
||||||
|
|
||||||
static bool check_builtin_idle_enabled(void)
|
|
||||||
{
|
|
||||||
if (static_branch_likely(&scx_builtin_idle_enabled))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
scx_ops_error("built-in idle tracking is disabled");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
|
|
||||||
* @p: task_struct to select a CPU for
|
|
||||||
* @prev_cpu: CPU @p was on previously
|
|
||||||
* @wake_flags: %SCX_WAKE_* flags
|
|
||||||
* @is_idle: out parameter indicating whether the returned CPU is idle
|
|
||||||
*
|
|
||||||
* Can only be called from ops.select_cpu() if the built-in CPU selection is
|
|
||||||
* enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
|
|
||||||
* @p, @prev_cpu and @wake_flags match ops.select_cpu().
|
|
||||||
*
|
|
||||||
* Returns the picked CPU with *@is_idle indicating whether the picked CPU is
|
|
||||||
* currently idle and thus a good candidate for direct dispatching.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
|
|
||||||
u64 wake_flags, bool *is_idle)
|
|
||||||
{
|
|
||||||
if (!check_builtin_idle_enabled())
|
|
||||||
goto prev_cpu;
|
|
||||||
|
|
||||||
if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
|
|
||||||
goto prev_cpu;
|
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
|
||||||
return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
prev_cpu:
|
|
||||||
*is_idle = false;
|
|
||||||
return prev_cpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
__bpf_kfunc_end_defs();
|
|
||||||
|
|
||||||
BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
|
|
||||||
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
|
|
||||||
BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
|
|
||||||
|
|
||||||
static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
|
|
||||||
.owner = THIS_MODULE,
|
|
||||||
.set = &scx_kfunc_ids_select_cpu,
|
|
||||||
};
|
|
||||||
|
|
||||||
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
|
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
|
||||||
{
|
{
|
||||||
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
|
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
|
||||||
@ -7458,142 +6881,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
|
|
||||||
* per-CPU cpumask.
|
|
||||||
*
|
|
||||||
* Returns NULL if idle tracking is not enabled, or running on a UP kernel.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
|
|
||||||
{
|
|
||||||
if (!check_builtin_idle_enabled())
|
|
||||||
return cpu_none_mask;
|
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
|
||||||
return idle_masks.cpu;
|
|
||||||
#else
|
|
||||||
return cpu_none_mask;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
|
|
||||||
* per-physical-core cpumask. Can be used to determine if an entire physical
|
|
||||||
* core is free.
|
|
||||||
*
|
|
||||||
* Returns NULL if idle tracking is not enabled, or running on a UP kernel.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
|
|
||||||
{
|
|
||||||
if (!check_builtin_idle_enabled())
|
|
||||||
return cpu_none_mask;
|
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
|
||||||
if (sched_smt_active())
|
|
||||||
return idle_masks.smt;
|
|
||||||
else
|
|
||||||
return idle_masks.cpu;
|
|
||||||
#else
|
|
||||||
return cpu_none_mask;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
|
|
||||||
* either the percpu, or SMT idle-tracking cpumask.
|
|
||||||
* @idle_mask: &cpumask to use
|
|
||||||
*/
|
|
||||||
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Empty function body because we aren't actually acquiring or releasing
|
|
||||||
* a reference to a global idle cpumask, which is read-only in the
|
|
||||||
* caller and is never released. The acquire / release semantics here
|
|
||||||
* are just used to make the cpumask a trusted pointer in the caller.
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
|
|
||||||
* @cpu: cpu to test and clear idle for
|
|
||||||
*
|
|
||||||
* Returns %true if @cpu was idle and its idle state was successfully cleared.
|
|
||||||
* %false otherwise.
|
|
||||||
*
|
|
||||||
* Unavailable if ops.update_idle() is implemented and
|
|
||||||
* %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
|
|
||||||
{
|
|
||||||
if (!check_builtin_idle_enabled())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (ops_cpu_valid(cpu, NULL))
|
|
||||||
return test_and_clear_cpu_idle(cpu);
|
|
||||||
else
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
|
|
||||||
* @cpus_allowed: Allowed cpumask
|
|
||||||
* @flags: %SCX_PICK_IDLE_CPU_* flags
|
|
||||||
*
|
|
||||||
* Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
|
|
||||||
* number on success. -%EBUSY if no matching cpu was found.
|
|
||||||
*
|
|
||||||
* Idle CPU tracking may race against CPU scheduling state transitions. For
|
|
||||||
* example, this function may return -%EBUSY as CPUs are transitioning into the
|
|
||||||
* idle state. If the caller then assumes that there will be dispatch events on
|
|
||||||
* the CPUs as they were all busy, the scheduler may end up stalling with CPUs
|
|
||||||
* idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
|
|
||||||
* scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
|
|
||||||
* event in the near future.
|
|
||||||
*
|
|
||||||
* Unavailable if ops.update_idle() is implemented and
|
|
||||||
* %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
|
|
||||||
u64 flags)
|
|
||||||
{
|
|
||||||
if (!check_builtin_idle_enabled())
|
|
||||||
return -EBUSY;
|
|
||||||
|
|
||||||
return scx_pick_idle_cpu(cpus_allowed, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
|
|
||||||
* @cpus_allowed: Allowed cpumask
|
|
||||||
* @flags: %SCX_PICK_IDLE_CPU_* flags
|
|
||||||
*
|
|
||||||
* Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
|
|
||||||
* CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
|
|
||||||
* number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
|
|
||||||
* empty.
|
|
||||||
*
|
|
||||||
* If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
|
|
||||||
* set, this function can't tell which CPUs are idle and will always pick any
|
|
||||||
* CPU.
|
|
||||||
*/
|
|
||||||
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
|
|
||||||
u64 flags)
|
|
||||||
{
|
|
||||||
s32 cpu;
|
|
||||||
|
|
||||||
if (static_branch_likely(&scx_builtin_idle_enabled)) {
|
|
||||||
cpu = scx_pick_idle_cpu(cpus_allowed, flags);
|
|
||||||
if (cpu >= 0)
|
|
||||||
return cpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
cpu = cpumask_any_distribute(cpus_allowed);
|
|
||||||
if (cpu < nr_cpu_ids)
|
|
||||||
return cpu;
|
|
||||||
else
|
|
||||||
return -EBUSY;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* scx_bpf_task_running - Is task currently running?
|
* scx_bpf_task_running - Is task currently running?
|
||||||
* @p: task of interest
|
* @p: task of interest
|
||||||
@ -7769,8 +7056,6 @@ static int __init scx_init(void)
|
|||||||
* check using scx_kf_allowed().
|
* check using scx_kf_allowed().
|
||||||
*/
|
*/
|
||||||
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
||||||
&scx_kfunc_set_select_cpu)) ||
|
|
||||||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
|
||||||
&scx_kfunc_set_enqueue_dispatch)) ||
|
&scx_kfunc_set_enqueue_dispatch)) ||
|
||||||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
||||||
&scx_kfunc_set_dispatch)) ||
|
&scx_kfunc_set_dispatch)) ||
|
||||||
@ -7790,6 +7075,12 @@ static int __init scx_init(void)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = scx_idle_init();
|
||||||
|
if (ret) {
|
||||||
|
pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
|
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
|
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
|
||||||
|
752
kernel/sched/ext_idle.c
Normal file
752
kernel/sched/ext_idle.c
Normal file
@ -0,0 +1,752 @@
|
|||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
|
||||||
|
*
|
||||||
|
* Built-in idle CPU tracking policy.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
|
||||||
|
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
||||||
|
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
||||||
|
* Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
|
||||||
|
*/
|
||||||
|
#include "ext_idle.h"
|
||||||
|
|
||||||
|
/* Enable/disable built-in idle CPU selection policy */
|
||||||
|
DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
|
||||||
|
|
||||||
|
#ifdef CONFIG_SMP
|
||||||
|
#ifdef CONFIG_CPUMASK_OFFSTACK
|
||||||
|
#define CL_ALIGNED_IF_ONSTACK
|
||||||
|
#else
|
||||||
|
#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Enable/disable LLC aware optimizations */
|
||||||
|
DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
|
||||||
|
|
||||||
|
/* Enable/disable NUMA aware optimizations */
|
||||||
|
DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
|
||||||
|
|
||||||
|
static struct {
|
||||||
|
cpumask_var_t cpu;
|
||||||
|
cpumask_var_t smt;
|
||||||
|
} idle_masks CL_ALIGNED_IF_ONSTACK;
|
||||||
|
|
||||||
|
bool scx_idle_test_and_clear_cpu(int cpu)
|
||||||
|
{
|
||||||
|
#ifdef CONFIG_SCHED_SMT
|
||||||
|
/*
|
||||||
|
* SMT mask should be cleared whether we can claim @cpu or not. The SMT
|
||||||
|
* cluster is not wholly idle either way. This also prevents
|
||||||
|
* scx_pick_idle_cpu() from getting caught in an infinite loop.
|
||||||
|
*/
|
||||||
|
if (sched_smt_active()) {
|
||||||
|
const struct cpumask *smt = cpu_smt_mask(cpu);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If offline, @cpu is not its own sibling and
|
||||||
|
* scx_pick_idle_cpu() can get caught in an infinite loop as
|
||||||
|
* @cpu is never cleared from idle_masks.smt. Ensure that @cpu
|
||||||
|
* is eventually cleared.
|
||||||
|
*
|
||||||
|
* NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
|
||||||
|
* reduce memory writes, which may help alleviate cache
|
||||||
|
* coherence pressure.
|
||||||
|
*/
|
||||||
|
if (cpumask_intersects(smt, idle_masks.smt))
|
||||||
|
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
|
||||||
|
else if (cpumask_test_cpu(cpu, idle_masks.smt))
|
||||||
|
__cpumask_clear_cpu(cpu, idle_masks.smt);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
|
||||||
|
{
|
||||||
|
int cpu;
|
||||||
|
|
||||||
|
retry:
|
||||||
|
if (sched_smt_active()) {
|
||||||
|
cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
|
||||||
|
if (cpu < nr_cpu_ids)
|
||||||
|
goto found;
|
||||||
|
|
||||||
|
if (flags & SCX_PICK_IDLE_CORE)
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
|
cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
|
||||||
|
if (cpu >= nr_cpu_ids)
|
||||||
|
return -EBUSY;
|
||||||
|
|
||||||
|
found:
|
||||||
|
if (scx_idle_test_and_clear_cpu(cpu))
|
||||||
|
return cpu;
|
||||||
|
else
|
||||||
|
goto retry;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
|
||||||
|
* domain is not defined).
|
||||||
|
*/
|
||||||
|
static unsigned int llc_weight(s32 cpu)
|
||||||
|
{
|
||||||
|
struct sched_domain *sd;
|
||||||
|
|
||||||
|
sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
||||||
|
if (!sd)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return sd->span_weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
|
||||||
|
* domain is not defined).
|
||||||
|
*/
|
||||||
|
static struct cpumask *llc_span(s32 cpu)
|
||||||
|
{
|
||||||
|
struct sched_domain *sd;
|
||||||
|
|
||||||
|
sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
||||||
|
if (!sd)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return sched_domain_span(sd);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
|
||||||
|
* NUMA domain is not defined).
|
||||||
|
*/
|
||||||
|
static unsigned int numa_weight(s32 cpu)
|
||||||
|
{
|
||||||
|
struct sched_domain *sd;
|
||||||
|
struct sched_group *sg;
|
||||||
|
|
||||||
|
sd = rcu_dereference(per_cpu(sd_numa, cpu));
|
||||||
|
if (!sd)
|
||||||
|
return 0;
|
||||||
|
sg = sd->groups;
|
||||||
|
if (!sg)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return sg->group_weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
|
||||||
|
* domain is not defined).
|
||||||
|
*/
|
||||||
|
static struct cpumask *numa_span(s32 cpu)
|
||||||
|
{
|
||||||
|
struct sched_domain *sd;
|
||||||
|
struct sched_group *sg;
|
||||||
|
|
||||||
|
sd = rcu_dereference(per_cpu(sd_numa, cpu));
|
||||||
|
if (!sd)
|
||||||
|
return NULL;
|
||||||
|
sg = sd->groups;
|
||||||
|
if (!sg)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
return sched_group_span(sg);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return true if the LLC domains do not perfectly overlap with the NUMA
|
||||||
|
* domains, false otherwise.
|
||||||
|
*/
|
||||||
|
static bool llc_numa_mismatch(void)
|
||||||
|
{
|
||||||
|
int cpu;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We need to scan all online CPUs to verify whether their scheduling
|
||||||
|
* domains overlap.
|
||||||
|
*
|
||||||
|
* While it is rare to encounter architectures with asymmetric NUMA
|
||||||
|
* topologies, CPU hotplugging or virtualized environments can result
|
||||||
|
* in asymmetric configurations.
|
||||||
|
*
|
||||||
|
* For example:
|
||||||
|
*
|
||||||
|
* NUMA 0:
|
||||||
|
* - LLC 0: cpu0..cpu7
|
||||||
|
* - LLC 1: cpu8..cpu15 [offline]
|
||||||
|
*
|
||||||
|
* NUMA 1:
|
||||||
|
* - LLC 0: cpu16..cpu23
|
||||||
|
* - LLC 1: cpu24..cpu31
|
||||||
|
*
|
||||||
|
* In this case, if we only check the first online CPU (cpu0), we might
|
||||||
|
* incorrectly assume that the LLC and NUMA domains are fully
|
||||||
|
* overlapping, which is incorrect (as NUMA 1 has two distinct LLC
|
||||||
|
* domains).
|
||||||
|
*/
|
||||||
|
for_each_online_cpu(cpu)
|
||||||
|
if (llc_weight(cpu) != numa_weight(cpu))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize topology-aware scheduling.
|
||||||
|
*
|
||||||
|
* Detect if the system has multiple LLC or multiple NUMA domains and enable
|
||||||
|
* cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
|
||||||
|
* selection policy.
|
||||||
|
*
|
||||||
|
* Assumption: the kernel's internal topology representation assumes that each
|
||||||
|
* CPU belongs to a single LLC domain, and that each LLC domain is entirely
|
||||||
|
* contained within a single NUMA node.
|
||||||
|
*/
|
||||||
|
void scx_idle_update_selcpu_topology(void)
|
||||||
|
{
|
||||||
|
bool enable_llc = false, enable_numa = false;
|
||||||
|
unsigned int nr_cpus;
|
||||||
|
s32 cpu = cpumask_first(cpu_online_mask);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Enable LLC domain optimization only when there are multiple LLC
|
||||||
|
* domains among the online CPUs. If all online CPUs are part of a
|
||||||
|
* single LLC domain, the idle CPU selection logic can choose any
|
||||||
|
* online CPU without bias.
|
||||||
|
*
|
||||||
|
* Note that it is sufficient to check the LLC domain of the first
|
||||||
|
* online CPU to determine whether a single LLC domain includes all
|
||||||
|
* CPUs.
|
||||||
|
*/
|
||||||
|
rcu_read_lock();
|
||||||
|
nr_cpus = llc_weight(cpu);
|
||||||
|
if (nr_cpus > 0) {
|
||||||
|
if (nr_cpus < num_online_cpus())
|
||||||
|
enable_llc = true;
|
||||||
|
pr_debug("sched_ext: LLC=%*pb weight=%u\n",
|
||||||
|
cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Enable NUMA optimization only when there are multiple NUMA domains
|
||||||
|
* among the online CPUs and the NUMA domains don't perfectly overlaps
|
||||||
|
* with the LLC domains.
|
||||||
|
*
|
||||||
|
* If all CPUs belong to the same NUMA node and the same LLC domain,
|
||||||
|
* enabling both NUMA and LLC optimizations is unnecessary, as checking
|
||||||
|
* for an idle CPU in the same domain twice is redundant.
|
||||||
|
*/
|
||||||
|
nr_cpus = numa_weight(cpu);
|
||||||
|
if (nr_cpus > 0) {
|
||||||
|
if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
|
||||||
|
enable_numa = true;
|
||||||
|
pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
|
||||||
|
cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
pr_debug("sched_ext: LLC idle selection %s\n",
|
||||||
|
str_enabled_disabled(enable_llc));
|
||||||
|
pr_debug("sched_ext: NUMA idle selection %s\n",
|
||||||
|
str_enabled_disabled(enable_numa));
|
||||||
|
|
||||||
|
if (enable_llc)
|
||||||
|
static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
|
||||||
|
else
|
||||||
|
static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
|
||||||
|
if (enable_numa)
|
||||||
|
static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
|
||||||
|
else
|
||||||
|
static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Built-in CPU idle selection policy:
|
||||||
|
*
|
||||||
|
* 1. Prioritize full-idle cores:
|
||||||
|
* - always prioritize CPUs from fully idle cores (both logical CPUs are
|
||||||
|
* idle) to avoid interference caused by SMT.
|
||||||
|
*
|
||||||
|
* 2. Reuse the same CPU:
|
||||||
|
* - prefer the last used CPU to take advantage of cached data (L1, L2) and
|
||||||
|
* branch prediction optimizations.
|
||||||
|
*
|
||||||
|
* 3. Pick a CPU within the same LLC (Last-Level Cache):
|
||||||
|
* - if the above conditions aren't met, pick a CPU that shares the same LLC
|
||||||
|
* to maintain cache locality.
|
||||||
|
*
|
||||||
|
* 4. Pick a CPU within the same NUMA node, if enabled:
|
||||||
|
* - choose a CPU from the same NUMA node to reduce memory access latency.
|
||||||
|
*
|
||||||
|
* 5. Pick any idle CPU usable by the task.
|
||||||
|
*
|
||||||
|
* Step 3 and 4 are performed only if the system has, respectively, multiple
|
||||||
|
* LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
|
||||||
|
* scx_selcpu_topo_numa).
|
||||||
|
*
|
||||||
|
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
|
||||||
|
* we never call ops.select_cpu() for them, see select_task_rq().
|
||||||
|
*/
|
||||||
|
s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found)
|
||||||
|
{
|
||||||
|
const struct cpumask *llc_cpus = NULL;
|
||||||
|
const struct cpumask *numa_cpus = NULL;
|
||||||
|
s32 cpu;
|
||||||
|
|
||||||
|
*found = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is necessary to protect llc_cpus.
|
||||||
|
*/
|
||||||
|
rcu_read_lock();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine the scheduling domain only if the task is allowed to run
|
||||||
|
* on all CPUs.
|
||||||
|
*
|
||||||
|
* This is done primarily for efficiency, as it avoids the overhead of
|
||||||
|
* updating a cpumask every time we need to select an idle CPU (which
|
||||||
|
* can be costly in large SMP systems), but it also aligns logically:
|
||||||
|
* if a task's scheduling domain is restricted by user-space (through
|
||||||
|
* CPU affinity), the task will simply use the flat scheduling domain
|
||||||
|
* defined by user-space.
|
||||||
|
*/
|
||||||
|
if (p->nr_cpus_allowed >= num_possible_cpus()) {
|
||||||
|
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
|
||||||
|
numa_cpus = numa_span(prev_cpu);
|
||||||
|
|
||||||
|
if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
|
||||||
|
llc_cpus = llc_span(prev_cpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
|
||||||
|
*/
|
||||||
|
if (wake_flags & SCX_WAKE_SYNC) {
|
||||||
|
cpu = smp_processor_id();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the waker's CPU is cache affine and prev_cpu is idle,
|
||||||
|
* then avoid a migration.
|
||||||
|
*/
|
||||||
|
if (cpus_share_cache(cpu, prev_cpu) &&
|
||||||
|
scx_idle_test_and_clear_cpu(prev_cpu)) {
|
||||||
|
cpu = prev_cpu;
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the waker's local DSQ is empty, and the system is under
|
||||||
|
* utilized, try to wake up @p to the local DSQ of the waker.
|
||||||
|
*
|
||||||
|
* Checking only for an empty local DSQ is insufficient as it
|
||||||
|
* could give the wakee an unfair advantage when the system is
|
||||||
|
* oversaturated.
|
||||||
|
*
|
||||||
|
* Checking only for the presence of idle CPUs is also
|
||||||
|
* insufficient as the local DSQ of the waker could have tasks
|
||||||
|
* piled up on it even if there is an idle core elsewhere on
|
||||||
|
* the system.
|
||||||
|
*/
|
||||||
|
if (!cpumask_empty(idle_masks.cpu) &&
|
||||||
|
!(current->flags & PF_EXITING) &&
|
||||||
|
cpu_rq(cpu)->scx.local_dsq.nr == 0) {
|
||||||
|
if (cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If CPU has SMT, any wholly idle CPU is likely a better pick than
|
||||||
|
* partially idle @prev_cpu.
|
||||||
|
*/
|
||||||
|
if (sched_smt_active()) {
|
||||||
|
/*
|
||||||
|
* Keep using @prev_cpu if it's part of a fully idle core.
|
||||||
|
*/
|
||||||
|
if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
|
||||||
|
scx_idle_test_and_clear_cpu(prev_cpu)) {
|
||||||
|
cpu = prev_cpu;
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any fully idle core in the same LLC domain.
|
||||||
|
*/
|
||||||
|
if (llc_cpus) {
|
||||||
|
cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any fully idle core in the same NUMA node.
|
||||||
|
*/
|
||||||
|
if (numa_cpus) {
|
||||||
|
cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any full idle core usable by the task.
|
||||||
|
*/
|
||||||
|
cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use @prev_cpu if it's idle.
|
||||||
|
*/
|
||||||
|
if (scx_idle_test_and_clear_cpu(prev_cpu)) {
|
||||||
|
cpu = prev_cpu;
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any idle CPU in the same LLC domain.
|
||||||
|
*/
|
||||||
|
if (llc_cpus) {
|
||||||
|
cpu = scx_pick_idle_cpu(llc_cpus, 0);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any idle CPU in the same NUMA node.
|
||||||
|
*/
|
||||||
|
if (numa_cpus) {
|
||||||
|
cpu = scx_pick_idle_cpu(numa_cpus, 0);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Search for any idle CPU usable by the task.
|
||||||
|
*/
|
||||||
|
cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
|
||||||
|
if (cpu >= 0)
|
||||||
|
goto cpu_found;
|
||||||
|
|
||||||
|
rcu_read_unlock();
|
||||||
|
return prev_cpu;
|
||||||
|
|
||||||
|
cpu_found:
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
*found = true;
|
||||||
|
return cpu;
|
||||||
|
}
|
||||||
|
|
||||||
|
void scx_idle_reset_masks(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Consider all online cpus idle. Should converge to the actual state
|
||||||
|
* quickly.
|
||||||
|
*/
|
||||||
|
cpumask_copy(idle_masks.cpu, cpu_online_mask);
|
||||||
|
cpumask_copy(idle_masks.smt, cpu_online_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
void scx_idle_init_masks(void)
|
||||||
|
{
|
||||||
|
BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
|
||||||
|
BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void update_builtin_idle(int cpu, bool idle)
|
||||||
|
{
|
||||||
|
assign_cpu(cpu, idle_masks.cpu, idle);
|
||||||
|
|
||||||
|
#ifdef CONFIG_SCHED_SMT
|
||||||
|
if (sched_smt_active()) {
|
||||||
|
const struct cpumask *smt = cpu_smt_mask(cpu);
|
||||||
|
|
||||||
|
if (idle) {
|
||||||
|
/*
|
||||||
|
* idle_masks.smt handling is racy but that's fine as
|
||||||
|
* it's only for optimization and self-correcting.
|
||||||
|
*/
|
||||||
|
if (!cpumask_subset(smt, idle_masks.cpu))
|
||||||
|
return;
|
||||||
|
cpumask_or(idle_masks.smt, idle_masks.smt, smt);
|
||||||
|
} else {
|
||||||
|
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update the idle state of a CPU to @idle.
|
||||||
|
*
|
||||||
|
* If @do_notify is true, ops.update_idle() is invoked to notify the scx
|
||||||
|
* scheduler of an actual idle state transition (idle to busy or vice
|
||||||
|
* versa). If @do_notify is false, only the idle state in the idle masks is
|
||||||
|
* refreshed without invoking ops.update_idle().
|
||||||
|
*
|
||||||
|
* This distinction is necessary, because an idle CPU can be "reserved" and
|
||||||
|
* awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
|
||||||
|
* busy even if no tasks are dispatched. In this case, the CPU may return
|
||||||
|
* to idle without a true state transition. Refreshing the idle masks
|
||||||
|
* without invoking ops.update_idle() ensures accurate idle state tracking
|
||||||
|
* while avoiding unnecessary updates and maintaining balanced state
|
||||||
|
* transitions.
|
||||||
|
*/
|
||||||
|
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
|
||||||
|
{
|
||||||
|
int cpu = cpu_of(rq);
|
||||||
|
|
||||||
|
lockdep_assert_rq_held(rq);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Trigger ops.update_idle() only when transitioning from a task to
|
||||||
|
* the idle thread and vice versa.
|
||||||
|
*
|
||||||
|
* Idle transitions are indicated by do_notify being set to true,
|
||||||
|
* managed by put_prev_task_idle()/set_next_task_idle().
|
||||||
|
*/
|
||||||
|
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
|
||||||
|
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update the idle masks:
|
||||||
|
* - for real idle transitions (do_notify == true)
|
||||||
|
* - for idle-to-idle transitions (indicated by the previous task
|
||||||
|
* being the idle thread, managed by pick_task_idle())
|
||||||
|
*
|
||||||
|
* Skip updating idle masks if the previous task is not the idle
|
||||||
|
* thread, since set_next_task_idle() has already handled it when
|
||||||
|
* transitioning from a task to the idle thread (calling this
|
||||||
|
* function with do_notify == true).
|
||||||
|
*
|
||||||
|
* In this way we can avoid updating the idle masks twice,
|
||||||
|
* unnecessarily.
|
||||||
|
*/
|
||||||
|
if (static_branch_likely(&scx_builtin_idle_enabled))
|
||||||
|
if (do_notify || is_idle_task(rq->curr))
|
||||||
|
update_builtin_idle(cpu, idle);
|
||||||
|
}
|
||||||
|
#endif /* CONFIG_SMP */

/********************************************************************************
 * Helpers that can be called from the BPF scheduler.
 */
__bpf_kfunc_start_defs();

static bool check_builtin_idle_enabled(void)
{
        if (static_branch_likely(&scx_builtin_idle_enabled))
                return true;

        scx_ops_error("built-in idle tracking is disabled");
        return false;
}

/**
 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
 * @p: task_struct to select a CPU for
 * @prev_cpu: CPU @p was on previously
 * @wake_flags: %SCX_WAKE_* flags
 * @is_idle: out parameter indicating whether the returned CPU is idle
 *
 * Can only be called from ops.select_cpu() if the built-in CPU selection is
 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
 * @p, @prev_cpu and @wake_flags match ops.select_cpu().
 *
 * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
 * currently idle and thus a good candidate for direct dispatching.
 */
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
                                       u64 wake_flags, bool *is_idle)
{
        if (!check_builtin_idle_enabled())
                goto prev_cpu;

        if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
                goto prev_cpu;

#ifdef CONFIG_SMP
        return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
#endif

prev_cpu:
        *is_idle = false;
        return prev_cpu;
}
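
A typical consumer of this kfunc (not part of this patch) is an ops.select_cpu() implementation that defers to the built-in policy and direct-dispatches when an idle CPU was found, roughly as below. The sketch assumes tools/sched_ext's common.bpf.h; the direct-dispatch kfunc is shown as scx_bpf_dsq_insert(), whose name may differ in older trees.

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
        bool is_idle = false;
        s32 cpu;

        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
        if (is_idle)
                /* Picked CPU is idle: dispatch straight to its local DSQ. */
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

        return cpu;
}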

/**
 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
 * per-CPU cpumask.
 *
 * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
        if (!check_builtin_idle_enabled())
                return cpu_none_mask;

#ifdef CONFIG_SMP
        return idle_masks.cpu;
#else
        return cpu_none_mask;
#endif
}

/**
 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
 * per-physical-core cpumask. Can be used to determine if an entire physical
 * core is free.
 *
 * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
        if (!check_builtin_idle_enabled())
                return cpu_none_mask;

#ifdef CONFIG_SMP
        if (sched_smt_active())
                return idle_masks.smt;
        else
                return idle_masks.cpu;
#else
        return cpu_none_mask;
#endif
}

/**
 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
 * either the percpu, or SMT idle-tracking cpumask.
 * @idle_mask: &cpumask to use
 */
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
{
        /*
         * Empty function body because we aren't actually acquiring or releasing
         * a reference to a global idle cpumask, which is read-only in the
         * caller and is never released. The acquire / release semantics here
         * are just used to make the cpumask a trusted pointer in the caller.
         */
}
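
Since the getters are tagged KF_ACQUIRE and the put is KF_RELEASE (see the BTF id set below), BPF callers must pair them. A sketch of checking whether a CPU's whole physical core is idle, assuming common.bpf.h and the generic bpf_cpumask_test_cpu() kfunc:

static bool example_whole_core_idle(s32 cpu)
{
        const struct cpumask *smtmask;
        bool idle;

        smtmask = scx_bpf_get_idle_smtmask();
        /* A set bit means every SMT sibling of @cpu's core is idle. */
        idle = bpf_cpumask_test_cpu(cpu, smtmask);
        scx_bpf_put_idle_cpumask(smtmask);

        return idle;
}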

/**
 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
 * @cpu: cpu to test and clear idle for
 *
 * Returns %true if @cpu was idle and its idle state was successfully cleared.
 * %false otherwise.
 *
 * Unavailable if ops.update_idle() is implemented and
 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
 */
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
        if (!check_builtin_idle_enabled())
                return false;

        if (ops_cpu_valid(cpu, NULL))
                return scx_idle_test_and_clear_cpu(cpu);
        else
                return false;
}
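
One common use (illustrative, not from this patch) is a sticky-CPU ops.select_cpu() that claims the task's previous CPU if it is still idle, instead of going through scx_bpf_select_cpu_dfl(). A sketch under the same assumptions as the previous example:

s32 BPF_STRUCT_OPS(example_select_cpu_sticky, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
        /* Claim @prev_cpu if it is still idle and dispatch straight to it. */
        if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

        return prev_cpu;
}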

/**
 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
 * number on success. -%EBUSY if no matching cpu was found.
 *
 * Idle CPU tracking may race against CPU scheduling state transitions. For
 * example, this function may return -%EBUSY as CPUs are transitioning into the
 * idle state. If the caller then assumes that there will be dispatch events on
 * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
 * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
 * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
 * event in the near future.
 *
 * Unavailable if ops.update_idle() is implemented and
 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
 */
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
                                      u64 flags)
{
        if (!check_builtin_idle_enabled())
                return -EBUSY;

        return scx_pick_idle_cpu(cpus_allowed, flags);
}

/**
 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
 * empty.
 *
 * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
 * set, this function can't tell which CPUs are idle and will always pick any
 * CPU.
 */
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
                                     u64 flags)
{
        s32 cpu;

        if (static_branch_likely(&scx_builtin_idle_enabled)) {
                cpu = scx_pick_idle_cpu(cpus_allowed, flags);
                if (cpu >= 0)
                        return cpu;
        }

        cpu = cpumask_any_distribute(cpus_allowed);
        if (cpu < nr_cpu_ids)
                return cpu;
        else
                return -EBUSY;
}
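
The scx_bpf_pick_any_cpu() + scx_bpf_kick_cpu() pattern recommended in the scx_bpf_pick_idle_cpu() comment above could look roughly like this in an ops.enqueue() path. EXAMPLE_DSQ is a hypothetical custom DSQ assumed to have been created with scx_bpf_create_dsq() in ops.init(); common.bpf.h is assumed as before.

enum { EXAMPLE_DSQ = 0 };       /* hypothetical DSQ id, created elsewhere */

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
        s32 cpu;

        scx_bpf_dsq_insert(p, EXAMPLE_DSQ, SCX_SLICE_DFL, enq_flags);

        /* Prefer an idle CPU, but fall back to any allowed CPU ... */
        cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
        if (cpu >= 0)
                /* ... and kick it so a dispatch event is guaranteed soon. */
                scx_bpf_kick_cpu(cpu, 0);
}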

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_idle)
BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)

static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
        .owner                  = THIS_MODULE,
        .set                    = &scx_kfunc_ids_idle,
};

BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)

static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
        .owner                  = THIS_MODULE,
        .set                    = &scx_kfunc_ids_select_cpu,
};

int scx_idle_init(void)
{
        int ret;

        ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
              register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
              register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
              register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);

        return ret;
}
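
These registrations are what make the kfuncs above resolvable from BPF programs; on the scheduler side they are declared as __ksym externs (normally provided by tools/sched_ext/include/scx/common.bpf.h), along the lines of:

/* Sketch of the extern kfunc declarations a BPF scheduler would use;
 * the authoritative declarations live in the in-tree scx headers. */
s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;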

kernel/sched/ext_idle.h (new file)
@ -0,0 +1,39 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
 */
#ifndef _KERNEL_SCHED_EXT_IDLE_H
#define _KERNEL_SCHED_EXT_IDLE_H

extern struct static_key_false scx_builtin_idle_enabled;

#ifdef CONFIG_SMP
extern struct static_key_false scx_selcpu_topo_llc;
extern struct static_key_false scx_selcpu_topo_numa;

void scx_idle_update_selcpu_topology(void);
void scx_idle_reset_masks(void);
void scx_idle_init_masks(void);
bool scx_idle_test_and_clear_cpu(int cpu);
s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags);
#else /* !CONFIG_SMP */
static inline void scx_idle_update_selcpu_topology(void) {}
static inline void scx_idle_reset_masks(void) {}
static inline void scx_idle_init_masks(void) {}
static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
{
        return -EBUSY;
}
#endif /* CONFIG_SMP */

s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found);

extern int scx_idle_init(void);

#endif /* _KERNEL_SCHED_EXT_IDLE_H */