mirror_ubuntu-kernels/include/linux/sched/ext.h
David Vernet 8a010b81b3 sched_ext: Implement runnable task stall watchdog
The most common and critical way that a BPF scheduler can misbehave is by
failing to run runnable tasks for too long. This patch implements a
watchdog.

* All tasks record when they become runnable.

* A watchdog work periodically scans all runnable tasks. If any task has
  stayed runnable for too long, the BPF scheduler is aborted.

* scheduler_tick() monitors whether the watchdog itself is stuck. If so, the
  BPF scheduler is aborted.

Because the watchdog only scans the tasks which are currently runnable and
usually very infrequently, the overhead should be negligible.
scx_qmap is updated so that it can be told to stall user and/or
kernel tasks.

A detected task stall looks like the following:

 sched_ext: BPF scheduler "qmap" errored, disabling
 sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s)
    scx_check_timeout_workfn+0x10e/0x1b0
    process_one_work+0x287/0x560
    worker_thread+0x234/0x420
    kthread+0xe9/0x100
    ret_from_fork+0x1f/0x30

A detected watchdog stall:

 sched_ext: BPF scheduler "qmap" errored, disabling
 sched_ext: runnable task stall (watchdog failed to check in for 5.001s)
    scheduler_tick+0x2eb/0x340
    update_process_times+0x7a/0x90
    tick_sched_timer+0xd8/0x130
    __hrtimer_run_queues+0x178/0x3b0
    hrtimer_interrupt+0xfc/0x390
    __sysvec_apic_timer_interrupt+0xb7/0x2b0
    sysvec_apic_timer_interrupt+0x90/0xb0
    asm_sysvec_apic_timer_interrupt+0x1b/0x20
    default_idle+0x14/0x20
    arch_cpu_idle+0xf/0x20
    default_idle_call+0x50/0x90
    do_idle+0xe8/0x240
    cpu_startup_entry+0x1d/0x20
    kernel_init+0x0/0x190
    start_kernel+0x0/0x392
    start_kernel+0x324/0x392
    x86_64_start_reservations+0x2a/0x2c
    x86_64_start_kernel+0x104/0x109
    secondary_startup_64_no_verify+0xce/0xdb

Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h to
inline scx_notify_sched_tick().

v4: - While disabling, cancel_delayed_work_sync(&scx_watchdog_work) was
      being called before forward progress was guaranteed and thus could
      lead to system lockup. Relocated.

    - While enabling, it was comparing msecs against jiffies without
      conversion leading to spurious load failures on lower HZ kernels.
      Fixed.

    - runnable list management is now used by core bypass logic and moved to
      the patch implementing sched_ext core.

v3: - bpf_scx_init_member() was incorrectly comparing ops->timeout_ms
      against SCX_WATCHDOG_MAX_TIMEOUT which is in jiffies without
      conversion leading to spurious load failures in lower HZ kernels.
      Fixed.

v2: - Julia Lawall noticed that the watchdog code was mixing msecs and
      jiffies. Fix by using jiffies for everything.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
2024-06-18 10:09:18 -10:00

153 lines
4.6 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H
#ifdef CONFIG_SCHED_CLASS_EXT
#include <linux/llist.h>
#include <linux/rhashtable-types.h>
enum scx_public_consts {
SCX_OPS_NAME_LEN = 128,
SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
};
/*
* DSQ (dispatch queue) IDs are 64bit of the format:
*
* Bits: [63] [62 .. 0]
* [ B] [ ID ]
*
* B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
* ID: 63 bit ID
*
* Built-in IDs:
*
* Bits: [63] [62] [61..32] [31 .. 0]
* [ 1] [ L] [ R ] [ V ]
*
* 1: 1 for built-in DSQs.
* L: 1 for LOCAL_ON DSQ IDs, 0 for others
* V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
*/
enum scx_dsq_id_flags {
SCX_DSQ_FLAG_BUILTIN = 1LLU << 63,
SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62,
SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
};
/*
* Dispatch queue (dsq) is a simple FIFO which is used to buffer between the
* scheduler core and the BPF scheduler. See the documentation for more details.
*/
struct scx_dispatch_q {
raw_spinlock_t lock;
struct list_head list; /* tasks in dispatch order */
u32 nr;
u64 id;
struct rhash_head hash_node;
struct llist_node free_node;
struct rcu_head rcu;
};
/* scx_entity.flags */
enum scx_ent_flags {
SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */
SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */
SCX_TASK_STATE_BITS = 2,
SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
};
/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
SCX_TASK_NONE, /* ops.init_task() not called yet */
SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */
SCX_TASK_READY, /* fully initialized, but not in sched_ext */
SCX_TASK_ENABLED, /* fully initialized and in sched_ext */
SCX_TASK_NR_STATES,
};
/*
* Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
* everywhere and the following bits track which kfunc sets are currently
* allowed for %current. This simple per-task tracking works because SCX ops
* nest in a limited way. BPF will likely implement a way to allow and disallow
* kfuncs depending on the calling context which will replace this manual
* mechanism. See scx_kf_allow().
*/
enum scx_kf_mask {
SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */
/* all non-sleepables may be nested inside SLEEPABLE */
SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */
/* ops.dequeue (in REST) may be nested inside DISPATCH */
SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */
SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */
SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */
SCX_KF_REST = 1 << 5, /* other rq-locked operations */
__SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH |
SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
/*
* The following is embedded in task_struct and contains all fields necessary
* for a task to be scheduled by SCX.
*/
struct sched_ext_entity {
struct scx_dispatch_q *dsq;
struct list_head dsq_node;
u32 flags; /* protected by rq lock */
u32 weight;
s32 sticky_cpu;
s32 holding_cpu;
u32 kf_mask; /* see scx_kf_mask above */
atomic_long_t ops_state;
struct list_head runnable_node; /* rq->scx.runnable_list */
unsigned long runnable_at;
u64 ddsp_dsq_id;
u64 ddsp_enq_flags;
/* BPF scheduler modifiable fields */
/*
* Runtime budget in nsecs. This is usually set through
* scx_bpf_dispatch() but can also be modified directly by the BPF
* scheduler. Automatically decreased by SCX as the task executes. On
* depletion, a scheduling event is triggered.
*/
u64 slice;
/* cold fields */
/* must be the last field, see init_scx_entity() */
struct list_head tasks_node;
};
void sched_ext_free(struct task_struct *p);
#else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#endif /* _LINUX_SCHED_EXT_H */