Commit 7b0888b7 authored by Tejun Heo

sched_ext: Implement core-sched support

The core-sched support is composed of the following parts:

- task_struct->scx.core_sched_at is added. This is a timestamp which can be
  used to order tasks. Depending on whether the BPF scheduler implements
  custom ordering, it tracks either global FIFO ordering of all tasks or
  local-DSQ ordering within the dispatched tasks on a CPU.

- prio_less() is updated to call scx_prio_less() when comparing SCX tasks.
  scx_prio_less() calls ops.core_sched_before() if available or uses the
  core_sched_at timestamp. For global FIFO ordering, the BPF scheduler
  doesn't need to do anything. Otherwise, it should implement
  ops.core_sched_before() which reflects the ordering.

- When core-sched is enabled, balance_scx() balances all SMT siblings so
  that they all have tasks dispatched if necessary before pick_task_scx() is
  called. pick_task_scx() picks between the current task and the first
  dispatched task on the local DSQ based on availability and the
  core_sched_at timestamps. Note that FIFO ordering is expected among the
  already dispatched tasks, whether running or on the local DSQ, so this path
  always compares core_sched_at instead of calling into
  ops.core_sched_before() (a minimal sketch of that comparison follows this
  list).
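
As a concrete illustration of the core_sched_at fallback referenced above, the
FIFO comparison boils down to "the task that became runnable (or started
running) earlier goes first". A minimal sketch; the helper name is illustrative
and is not a function added by this patch:

    static bool fifo_runs_first(const struct task_struct *a,
                                const struct task_struct *b)
    {
            /*
             * Earlier core_sched_at means the task is older in the global
             * FIFO. time_before64() keeps the comparison correct across u64
             * wraparound.
             */
            return time_before64(a->scx.core_sched_at, b->scx.core_sched_at);
    }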

qmap_core_sched_before() is added to scx_qmap. It scales the
distances from the heads of the queues to compare the tasks across different
priority queues and seems to behave as expected.
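
To make the scaling concrete: with the five queues indexed 0-4, where a higher
index means a higher-priority queue, a non-negative queue distance is shifted
left by (4 - idx) in task_qdist() below. A task two entries behind the head of
queue 4 thus compares as 2, while a task two entries behind the head of queue 0
compares as 2 << 4 = 32; the task in the higher-priority queue ends up with the
lower scaled distance and is treated as the older, higher-priority task.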

v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi.

v2: Sched core added the const qualifiers to prio_less task arguments.
    Explicitly drop them for ops.core_sched_before() task arguments. BPF
    enforces access control through the verifier, so the qualifier isn't
    actually operative and only gets in the way when interacting with
    various helpers.
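
In practice the operation therefore takes plain task_struct pointers. The
struct_ops member looks roughly like the following (placement within
sched_ext_ops elided), matching the qmap_core_sched_before() signature in the
diff below:

    bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
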
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Reviewed-by: Josh Don <joshdon@google.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
parent 0fd55582
@@ -129,6 +129,9 @@ struct sched_ext_entity {
struct list_head runnable_node; /* rq->scx.runnable_list */
unsigned long runnable_at;
#ifdef CONFIG_SCHED_CORE
u64 core_sched_at; /* see scx_prio_less() */
#endif
u64 ddsp_dsq_id;
u64 ddsp_enq_flags;
......
@@ -135,7 +135,7 @@ config SCHED_CORE
config SCHED_CLASS_EXT
bool "Extensible Scheduling Class"
depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE
depends on BPF_SYSCALL && BPF_JIT
help
This option enables a new scheduler class sched_ext (SCX), which
allows scheduling policies to be implemented as BPF programs to
......
@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
if (task_on_scx(p))
return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}
/*
@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a,
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
#ifdef CONFIG_SCHED_CLASS_EXT
if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
return scx_prio_less(a, b, in_fi);
#endif
return false;
}
......
This diff is collapsed.
@@ -70,6 +70,11 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
for_active_class_range(class, (prev_class) > &ext_sched_class ? \
&ext_sched_class : (prev_class), (end_class))
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
#endif
#else /* CONFIG_SCHED_CLASS_EXT */
#define scx_enabled() false
......
@@ -13,6 +13,7 @@
* - Sleepable per-task storage allocation using ops.prep_enable().
* - Using ops.cpu_release() to handle a higher priority scheduling class taking
* the CPU away.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
* features and unlikely to be useful for actual workloads.
@@ -67,9 +68,21 @@ struct {
},
};
/*
* Per-queue sequence numbers to implement core-sched ordering.
*
* Tail seq is assigned to each queued task and incremented. Head seq tracks the
* sequence number of the latest dispatched task. The distance between a task's
* seq and the associated queue's head seq is called the queue distance and is
* used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];
/* Per-task scheduling context */
struct task_ctx {
bool force_local; /* Dispatch directly to local_dsq */
u64 core_sched_seq;
};
struct {
@@ -93,6 +106,7 @@ struct {
/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
u64 nr_core_sched_execed;
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
@@ -159,8 +173,18 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
/* Is select_cpu() is telling us to enqueue locally? */
if (tctx->force_local) {
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
* qmap_ops.flags.
*/
tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
tctx->force_local = false;
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
@@ -204,6 +228,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
__sync_fetch_and_add(&nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&nr_core_sched_execed, 1);
}
static void update_core_sched_head_seq(struct task_struct *p)
{
struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
int idx = weight_to_idx(p->scx.weight);
if (tctx)
core_sched_head_seqs[idx] = tctx->core_sched_seq;
else
scx_bpf_error("task_ctx lookup failed");
}
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
@@ -258,6 +295,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!p)
continue;
update_core_sched_head_seq(p);
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
bpf_task_release(p);
@@ -275,6 +313,49 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
/*
* The distance from the head of the queue scaled by the weight of the queue.
* The lower the number, the older the task and the higher the priority.
*/
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
struct task_ctx *tctx;
s64 qdist;
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
if (!tctx) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
/*
* As the queue index increments, the priority doubles: the queue with index 3
* is dispatched twice as frequently as the one with index 2. Reflect the
* difference by scaling qdists accordingly. Note that the shift amount needs to
* be flipped depending on the sign to avoid flipping the priority direction.
*/
if (qdist >= 0)
return qdist << (4 - idx);
else
return qdist << idx;
}
/*
* This is called to determine the task ordering when core-sched is picking
* tasks to execute on SMT siblings and should encode roughly the same ordering
* as the regular scheduling path. Use the priority-scaled distances from the
* heads of the queues to compare the two tasks, which keeps the result
* consistent with the dispatch path behavior.
*/
bool BPF_STRUCT_OPS(qmap_core_sched_before,
struct task_struct *a, struct task_struct *b)
{
return task_qdist(a) > task_qdist(b);
}
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
u32 cnt;
@@ -354,8 +435,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
return;
scx_bpf_dump("QMAP: force_local=%d",
taskc->force_local);
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
}
/*
@@ -428,6 +509,7 @@ SCX_OPS_DEFINE(qmap_ops,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.core_sched_before = (void *)qmap_core_sched_before,
.cpu_release = (void *)qmap_cpu_release,
.init_task = (void *)qmap_init_task,
.dump = (void *)qmap_dump,
@@ -437,5 +519,6 @@ SCX_OPS_DEFINE(qmap_ops,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.flags = SCX_OPS_ENQ_LAST,
.timeout_ms = 5000U,
.name = "qmap");
@@ -112,9 +112,10 @@ int main(int argc, char **argv)
long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched;
printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n",
printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_reenqueued, skel->bss->nr_dequeued);
skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
skel->bss->nr_core_sched_execed);
fflush(stdout);
sleep(1);
}
......