Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Implement the SCHED_DEADLINE server infrastructure - Daniel Bristot
   de Oliveira's last major contribution to the kernel:

     "SCHED_DEADLINE servers can help fixing starvation issues of low
      priority tasks (e.g., SCHED_OTHER) when higher priority tasks
      monopolize CPU cycles. Today we have RT Throttling; DEADLINE
      servers should be able to replace and improve that."

   (Daniel Bristot de Oliveira, Peter Zijlstra, Joel Fernandes, Youssef
   Esmat, Huang Shijie)

 - Preparatory changes for sched_ext integration:
     - Use set_next_task(.first) where required
     - Fix up set_next_task() implementations
     - Clean up DL server vs. core sched
     - Split up put_prev_task_balance()
     - Rework pick_next_task()
     - Combine the last put_prev_task() and the first set_next_task()
     - Rework dl_server
     - Add put_prev_task(.next)

   (Peter Zijlstra, with a fix by Tejun Heo)

 - Complete the EEVDF transition and refine EEVDF scheduling:
     - Implement delayed dequeue
     - Allow shorter slices to wakeup-preempt
     - Use sched_attr::sched_runtime to set request/slice suggestion
     - Document the new feature flags
     - Remove unused and duplicate-functionality fields
     - Simplify & unify pick_next_task_fair()
     - Misc debuggability enhancements

   (Peter Zijlstra, with fixes/cleanups by Dietmar Eggemann, Valentin
   Schneider and Chuyi Zhou)

 - Initialize the vruntime of a new task when it is first enqueued,
   resulting in a significant decrease in the latency of newly woken
   tasks (Zhang Qiao)

 - Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
   (K Prateek Nayak, Peter Zijlstra)

 - Clean up and clarify the usage of rt_task()
   (Qais Yousef)

 - Preempt SCHED_IDLE entities in strict cgroup hierarchies
   (Tianchen Ding)

 - Clarify the documentation of time units for deadline scheduler
   parameters (Christian Loehle)

 - Remove the HZ_BW chicken-bit feature flag introduced a year ago;
   the original change seems to be working fine (Phil Auld)

 - Misc fixes and cleanups (Chen Yu, Dan Carpenter, Huang Shijie,
   Peilin He, Qais Yousef and Vincent Guittot)

* tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/cpufreq: Use NSEC_PER_MSEC for deadline task
  cpufreq/cppc: Use NSEC_PER_MSEC for deadline task
  sched/deadline: Clarify nanoseconds in uapi
  sched/deadline: Convert schedtool example to chrt
  sched/debug: Fix the runnable tasks output
  sched: Fix sched_delayed vs sched_core
  kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
  kthread: Fix task state in kthread worker if being frozen
  sched/pelt: Use rq_clock_task() for hw_pressure
  sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c
  sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
  sched: Add put_prev_task(.next)
  sched: Rework dl_server
  sched: Combine the last put_prev_task() and the first set_next_task()
  sched: Rework pick_next_task()
  sched: Split up put_prev_task_balance()
  sched: Clean up DL server vs core sched
  sched: Fixup set_next_task() implementations
  sched: Use set_next_task(.first) where required
  sched/fair: Properly deactivate sched_delayed task upon class change
  ...
This commit is contained in:
Linus Torvalds
2024-09-19 15:55:58 +02:00
32 changed files with 1695 additions and 747 deletions

View File

@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task_is_realtime(task))
else if (rt_or_dl_task_policy(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;

View File

@@ -149,8 +149,9 @@ struct user_event_mm;
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
@@ -541,9 +542,14 @@ struct sched_entity {
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
u64 min_slice;
struct list_head group_node;
unsigned int on_rq;
unsigned char on_rq;
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
/* hole */
u64 exec_start;
u64 sum_exec_runtime;
@@ -639,12 +645,26 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*
* @dl_server tells if this is a server entity.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -672,7 +692,7 @@ struct sched_dl_entity {
*/
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*

View File

@@ -10,16 +10,16 @@
#include <linux/sched.h>
#define MAX_DL_PRIO 0
static inline int dl_prio(int prio)
static inline bool dl_prio(int prio)
{
if (unlikely(prio < MAX_DL_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_DL_PRIO);
}
static inline int dl_task(struct task_struct *p)
/*
* Returns true if a task has a priority that belongs to DL class. PI-boosted
* tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
*/
static inline bool dl_task(struct task_struct *p)
{
return dl_prio(p->prio);
}

View File

@@ -14,6 +14,7 @@
*/
#define MAX_RT_PRIO 100
#define MAX_DL_PRIO 0
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)

View File

@@ -6,19 +6,40 @@
struct task_struct;
static inline int rt_prio(int prio)
static inline bool rt_prio(int prio)
{
if (unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
}
static inline int rt_task(struct task_struct *p)
static inline bool rt_or_dl_prio(int prio)
{
return unlikely(prio < MAX_RT_PRIO);
}
/*
* Returns true if a task has a priority that belongs to RT class. PI-boosted
* tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
*/
static inline bool rt_task(struct task_struct *p)
{
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
/*
* Returns true if a task has a priority that belongs to RT or DL classes.
* PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
* PI-boosted tasks.
*/
static inline bool rt_or_dl_task(struct task_struct *p)
{
return rt_or_dl_prio(p->prio);
}
/*
* Returns true if a task has a policy that belongs to RT or DL classes.
* PI-boosted tasks will return false.
*/
static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
{
int policy = tsk->policy;