Commit 245254f7 authored by David Vernet, committed by Tejun Heo

sched_ext: Implement sched_ext_ops.cpu_acquire/release()

Scheduler classes are strictly ordered and when a higher priority class has
tasks to run, the lower priority ones lose access to the CPU. Being able to
monitor and act on these events is necessary for use cases including
strict core-scheduling and latency management.

This patch adds two operations, ops.cpu_acquire() and ops.cpu_release(). The
former is invoked when a CPU becomes available to the BPF scheduler; the
latter when a CPU is taken away. This patch also implements
scx_bpf_reenqueue_local(), which can be called from ops.cpu_release() to
requeue all tasks in the CPU's local DSQ so that they can be reassigned to
other available CPUs.
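
For illustration, the simplest consumer is an ops.cpu_release() callback
that just bounces the local DSQ (a minimal sketch in the style of the
tools/sched_ext examples, not code from this patch):

	/*
	 * Sketch: when the CPU is taken by a higher priority class, push
	 * everything on its local DSQ back through ops.enqueue(). The
	 * re-enqueued tasks arrive there with SCX_ENQ_REENQ set.
	 */
	void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
			    struct scx_cpu_release_args *args)
	{
		scx_bpf_reenqueue_local();
	}

The scx_qmap diff below wires up exactly this pattern, plus a statistics
counter.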

scx_pair is updated to use .cpu_acquire/release() along with
%SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU
is preempted by a higher priority scheduler class.
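
The scx_pair changes themselves are not included in this view. The shape of
the fix can be sketched as follows (a hedged sketch only: the callback name
and the pair_sibling_of() helper are invented, and it assumes SCX_KICK_WAIT
makes scx_bpf_kick_cpu() wait until the target CPU has actually entered the
scheduler):

	/*
	 * Sketch: once this CPU is preempted by a higher priority class, the
	 * pairing can no longer be enforced locally, so preempt the sibling
	 * too. SCX_KICK_WAIT keeps the guarantee strict by not returning
	 * until the sibling has rescheduled.
	 */
	void BPF_STRUCT_OPS(pair_sketch_cpu_release, s32 cpu,
			    struct scx_cpu_release_args *args)
	{
		scx_bpf_kick_cpu(pair_sibling_of(cpu),
				 SCX_KICK_PREEMPT | SCX_KICK_WAIT);
	}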

scx_qmap is updated to use ops.cpu_release() to empty the local DSQ of a
preempted CPU. A similar approach can be adopted by BPF schedulers that want
tight control over latency.

v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing.

v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces
    access control through the verifier, so the qualifier isn't actually
    operative and only gets in the way when interacting with various
    helpers.

v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local()
    from ops.cpu_release() nested inside ops.init() and other sleepable
    operations.
Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
parent 90e55164
@@ -98,13 +98,15 @@ enum scx_kf_mask {
 	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
 	/* all non-sleepables may be nested inside SLEEPABLE */
 	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
+	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
 	/* ops.dequeue (in REST) may be nested inside DISPATCH */
 	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
 	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
 	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
 	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */

-	__SCX_KF_RQ_LOCKED	= SCX_KF_DISPATCH |
+	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
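
The enforcement side of these masks lives in the collapsed
kernel/sched/ext.c diff. Conceptually, a kfunc that is only legal from
ops.cpu_release() would be gated along these lines (an illustrative sketch
with a made-up helper name, not the patch's actual code):

	/* Sketch: p->scx.kf_mask records which ops callback @p is running. */
	static bool sketch_kf_allowed(u32 mask)
	{
		return (current->scx.kf_mask & mask) != 0;
	}

	/* scx_bpf_reenqueue_local() would then require SCX_KF_CPU_RELEASE. */

Note that __SCX_KF_RQ_LOCKED now includes SCX_KF_CPU_RELEASE, i.e.
ops.cpu_release() is invoked with the rq lock held.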
(This diff is collapsed.)
@@ -24,6 +24,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
 #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
 #define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)

+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+
 static inline bool task_on_scx(const struct task_struct *p)
 {
 	return scx_enabled() && p->sched_class == &ext_sched_class;
@@ -737,6 +737,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			flags;
+	bool			cpu_released;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
 	cpumask_var_t		cpus_to_preempt;
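
The code that maintains this flag is in the collapsed core diff; the intent
is a strict pairing of the two callbacks. A small userspace model of the
handshake (illustrative only; all names here are invented):

	#include <stdbool.h>
	#include <stdio.h>

	struct scx_rq_model { bool cpu_released; };

	/* A higher priority class starts running on the CPU. */
	static void model_cpu_taken(struct scx_rq_model *rq)
	{
		if (!rq->cpu_released) {
			puts("ops.cpu_release()");	/* fires once per takeover */
			rq->cpu_released = true;
		}
	}

	/* The CPU becomes available to sched_ext again. */
	static void model_cpu_returned(struct scx_rq_model *rq)
	{
		if (rq->cpu_released) {
			puts("ops.cpu_acquire()");	/* paired with the release */
			rq->cpu_released = false;
		}
	}

	int main(void)
	{
		struct scx_rq_model rq = { .cpu_released = false };

		model_cpu_taken(&rq);
		model_cpu_taken(&rq);	/* no duplicate release notification */
		model_cpu_returned(&rq);
		return 0;
	}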
@@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
+u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
@@ -11,6 +11,8 @@
  *
  * - BPF-side queueing using PIDs.
  * - Sleepable per-task storage allocation using ops.prep_enable().
+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking
+ *   the CPU away.
  *
  * This scheduler is primarily for demonstration and testing of sched_ext
  * features and unlikely to be useful for actual workloads.
@@ -90,7 +92,7 @@ struct {
 } cpu_ctx_stor SEC(".maps");

 /* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_dequeued;
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;

 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
@@ -164,6 +166,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}

+	/*
+	 * If the task was re-enqueued due to the CPU being preempted by a
+	 * higher priority scheduling class, just re-enqueue the task directly
+	 * on the global DSQ. As we want another CPU to pick it up, find and
+	 * kick an idle CPU.
+	 */
+	if (enq_flags & SCX_ENQ_REENQ) {
+		s32 cpu;
+
+		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
+		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+		if (cpu >= 0)
+			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+		return;
+	}
+
 	ring = bpf_map_lookup_elem(&queue_arr, &idx);
 	if (!ring) {
 		scx_bpf_error("failed to find ring %d", idx);
@@ -257,6 +275,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	}
 }

+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	u32 cnt;
+
+	/*
+	 * Called when @cpu is taken by a higher priority scheduling class. This
+	 * makes @cpu no longer available for executing sched_ext tasks. As we
+	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
+	 * becomes available again, re-enqueue them into the global dsq. See
+	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
+	 */
+	cnt = scx_bpf_reenqueue_local();
+	if (cnt)
+		__sync_fetch_and_add(&nr_reenqueued, cnt);
+}
+
 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
 {
@@ -339,6 +373,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .enqueue			= (void *)qmap_enqueue,
 	       .dequeue			= (void *)qmap_dequeue,
 	       .dispatch		= (void *)qmap_dispatch,
+	       .cpu_release		= (void *)qmap_cpu_release,
 	       .init_task		= (void *)qmap_init_task,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
@@ -112,9 +112,9 @@ int main(int argc, char **argv)
 		long nr_enqueued = skel->bss->nr_enqueued;
 		long nr_dispatched = skel->bss->nr_dispatched;

-		printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n",
+		printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n",
 		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
-		       skel->bss->nr_dequeued);
+		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued);
 		fflush(stdout);
 		sleep(1);
 	}