If a BPF scheduler triggers an error, the scheduler is aborted and the
system is reverted to the built-in scheduler. In the process, a lot of
information which may be useful for figuring out what happened can be lost.
This patch adds a debug dump which captures information that may be useful
for debugging, including the runqueue and runnable thread states at the time
of failure. The following shows a debug dump after triggering the watchdog:

root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100
stats : enq=1 dsp=0 delta=1 deq=0
stats : enq=90 dsp=90 delta=0 deq=0
stats : enq=156 dsp=156 delta=0 deq=0
stats : enq=218 dsp=218 delta=0 deq=0
stats : enq=255 dsp=255 delta=0 deq=0
stats : enq=271 dsp=271 delta=0 deq=0
stats : enq=284 dsp=284 delta=0 deq=0
stats : enq=293 dsp=293 delta=0 deq=0
DEBUG DUMP
================================================================================
kworker/u32:12[320] triggered exit kind 1026:
  runnable task stall (stress[1530] failed to run for 6.841s)

Backtrace:
  scx_watchdog_workfn+0x136/0x1c0
  process_scheduled_works+0x2b5/0x600
  worker_thread+0x269/0x360
  kthread+0xeb/0x110
  ret_from_fork+0x36/0x40
  ret_from_fork_asm+0x1a/0x30

QMAP FIFO[0]:
QMAP FIFO[1]:
QMAP FIFO[2]: 1436
QMAP FIFO[3]:
QMAP FIFO[4]:

CPU states
----------

CPU 0   : nr_run=1 ops_qseq=244
          curr=swapper/0[0] class=idle_sched_class

  QMAP: dsp_idx=1 dsp_cnt=0

  R stress[1530] -6841ms
      scx_state/flags=3/0x1 ops_state/qseq=2/20
      sticky/holding_cpu=-1/-1 dsq_id=(n/a)
      cpus=ff

    QMAP: force_local=0

    asm_sysvec_apic_timer_interrupt+0x16/0x20

CPU 2   : nr_run=2 ops_qseq=142
          curr=swapper/2[0] class=idle_sched_class

  QMAP: dsp_idx=1 dsp_cnt=0

  R sshd[1703] -5905ms
      scx_state/flags=3/0x9 ops_state/qseq=2/88
      sticky/holding_cpu=-1/-1 dsq_id=(n/a)
      cpus=ff

    QMAP: force_local=1

    __x64_sys_ppoll+0xf6/0x120
    do_syscall_64+0x7b/0x150
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

  R fish[1539] -4141ms
      scx_state/flags=3/0x9 ops_state/qseq=2/124
      sticky/holding_cpu=-1/-1 dsq_id=(n/a)
      cpus=ff

    QMAP: force_local=1

    futex_wait+0x60/0xe0
    do_futex+0x109/0x180
    __x64_sys_futex+0x117/0x190
    do_syscall_64+0x7b/0x150
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

CPU 3   : nr_run=2 ops_qseq=162
          curr=kworker/u32:12[320] class=ext_sched_class

  QMAP: dsp_idx=1 dsp_cnt=0

 *R kworker/u32:12[320] +0ms
      scx_state/flags=3/0xd ops_state/qseq=0/0
      sticky/holding_cpu=-1/-1 dsq_id=(n/a)
      cpus=ff

    QMAP: force_local=0

    scx_dump_state+0x613/0x6f0
    scx_ops_error_irq_workfn+0x1f/0x40
    irq_work_run_list+0x82/0xd0
    irq_work_run+0x14/0x30
    __sysvec_irq_work+0x40/0x140
    sysvec_irq_work+0x60/0x70
    asm_sysvec_irq_work+0x16/0x20
    scx_watchdog_workfn+0x15f/0x1c0
    process_scheduled_works+0x2b5/0x600
    worker_thread+0x269/0x360
    kthread+0xeb/0x110
    ret_from_fork+0x36/0x40
    ret_from_fork_asm+0x1a/0x30

  R kworker/3:2[1436] +0ms
      scx_state/flags=3/0x9 ops_state/qseq=2/160
      sticky/holding_cpu=-1/-1 dsq_id=(n/a)
      cpus=08

    QMAP: force_local=0

    kthread+0xeb/0x110
    ret_from_fork+0x36/0x40
    ret_from_fork_asm+0x1a/0x30

CPU 7   : nr_run=0 ops_qseq=76
          curr=swapper/7[0] class=idle_sched_class
================================================================================
EXIT: runnable task stall (stress[1530] failed to run for 6.841s)

It shows that CPU 3 was running the watchdog when it triggered the error
condition, and that the stalled thread, stress[1530], had been queued on
CPU 0 for over 6.8 seconds but failed to run. It also prints out
scx_qmap-specific information - e.g. which tasks are queued on each FIFO -
using the dump_*() ops. This dump has proved pretty useful for developing
and debugging BPF schedulers.

A debug dump is generated automatically when the BPF scheduler exits due to
an error. The size of the dump buffer used in this case is determined by
sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns
the available buffer, the output is truncated and marked accordingly.
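
For example, a scheduler wanting a larger buffer could set the field in its
ops definition; the 512k value below is illustrative, not part of this patch:

	.exit_dump_len		= 512 * 1024,	/* illustrative: enlarge the exit dump buffer */
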
Debug dump output can also be read through the sched_ext_dump tracepoint.
When read through the tracepoint, there is no length limit.
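
For instance, with tracefs mounted in the standard location, something like
the following should stream the dump output (the exact event path is an
assumption and may vary):

root@test ~# echo 1 > /sys/kernel/tracing/events/sched_ext/sched_ext_dump/enable
root@test ~# cat /sys/kernel/tracing/trace_pipe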

SysRq-D can be used to trigger a debug dump at any time while a BPF
scheduler is loaded. This is non-destructive - the scheduler keeps running
afterwards. The output can be read through the sched_ext_dump tracepoint.
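
For example, assuming the kernel's sysrq interface is enabled:

root@test ~# echo D > /proc/sysrq-trigger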

v2: - The size of the exit debug dump buffer can now be customized using
      sched_ext_ops.exit_dump_len.

    - sched_ext_ops.dump*() added to enable dumping of BPF scheduler
      specific information.

    - Tracepoint output and SysRq-D triggering added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
};

char _license[] SEC("license") = "GPL";
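
/*
 * Tunables set from userspace (e.g. by the scx_qmap loader) before the
 * scheduler is attached. stall_user_nth and stall_kernel_nth skip enqueueing
 * every Nth task to provoke watchdog stalls; suppress_dump silences the
 * dump_*() callbacks below.
 */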
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;
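
/*
 * Pick a target CPU for @p. If the previous CPU is idle or @p can only run
 * there, flag the task so that enqueue dispatches it directly to the local
 * DSQ; otherwise prefer any idle CPU.
 */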
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
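
/*
 * Queue @p on the FIFO matching its weight. Tasks that select_cpu() flagged
 * are dispatched straight to the local DSQ instead. Every stall_user_nth
 * user task and every stall_kernel_nth kthread is deliberately skipped
 * (never queued) to make the watchdog trip for testing.
 */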
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}
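
/*
 * Consume from the shared DSQ first. Otherwise, round robin through the five
 * FIFOs, dispatching 1 task from queue0, 2 from queue1, 4 from queue2 and so
 * on, bounded by dsp_batch and the available dispatch slots.
 */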
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}
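
/*
 * The dump_*() ops below add scheduler-specific information to the debug
 * dump. ops.dump() is invoked once per dump, ops.dump_cpu() once per CPU and
 * ops.dump_task() once per runnable task; text emitted with scx_bpf_dump()
 * is appended to the dump buffer. Note that printing the FIFOs here pops
 * their entries.
 */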
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
		     cpuc->dsp_idx, cpuc->dsp_cnt);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d",
		     taskc->force_local);
}
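
/*
 * Create the shared DSQ used as the dispatch target. -1 indicates that the
 * DSQ is not bound to any particular NUMA node.
 */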
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
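
/*
 * timeout_ms arms the runnable-task-stall watchdog at 5s - the same
 * mechanism that fired in the example dump in the commit message above.
 */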
SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");