/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo
 * Copyright (c) 2022 David Vernet
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/*
	 * Is select_cpu() telling us to enqueue locally?
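	 * select_cpu() sets ->force_local when @p can only run on one CPU or
	 * when prev_cpu tested idle; in that case, bypass the FIFOs and
	 * dispatch straight to the local DSQ.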
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu",
				      cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
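	 *
	 * bpf_task_storage_get() with BPF_LOCAL_STORAGE_GET_F_CREATE allocates
	 * the per-task storage if it doesn't exist yet; as ops.init_task() is
	 * sleepable, that allocation can use GFP_KERNEL.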
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
		     cpuc->dsp_idx, cpuc->dsp_cnt);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d", taskc->force_local);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu	= (void *)qmap_select_cpu,
	       .enqueue		= (void *)qmap_enqueue,
	       .dequeue		= (void *)qmap_dequeue,
	       .dispatch	= (void *)qmap_dispatch,
	       .init_task	= (void *)qmap_init_task,
	       .dump		= (void *)qmap_dump,
	       .dump_cpu	= (void *)qmap_dump_cpu,
	       .dump_task	= (void *)qmap_dump_task,
	       .init		= (void *)qmap_init,
	       .exit		= (void *)qmap_exit,
	       .timeout_ms	= 5000U,
	       .name		= "qmap");