Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1

Pull sched/core to resolve conflicts between:

  c2a57380df ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree and commit:

  cde94c032b ("sched_ext: Make watchdog sub-sched aware")

The latter moves around code modified by the former. Apply the changes in
the new locations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tejun Heo
2026-03-09 09:59:36 -10:00
15 changed files with 363 additions and 112 deletions


@@ -136,9 +136,6 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
}
#endif
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
extern void mm_init_global_asid(struct mm_struct *mm);
extern void mm_free_global_asid(struct mm_struct *mm);


@@ -172,6 +172,28 @@ struct tlb_state_shared {
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
/*
* Please ignore the name of this function. It should be called
* switch_to_kernel_thread().
*
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
* kernel thread or other context without an mm. Acceptable implementations
* include doing nothing whatsoever, switching to init_mm, or various clever
* lazy tricks to try to minimize TLB flushes.
*
* The scheduler reserves the right to call enter_lazy_tlb() several times
* in a row. It will notify us that we're going back to a real mm by
* calling switch_mm_irqs_off().
*/
#define enter_lazy_tlb enter_lazy_tlb
static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay
@@ -480,6 +502,10 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif
#else /* !MODULE */
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
__compiletime_error("enter_lazy_tlb() should not be used in modules");
#endif /* !MODULE */
static inline void __native_tlb_flush_global(unsigned long cr4)
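
For context (not part of this diff): enter_lazy_tlb() is invoked from the scheduler's context_switch() when the incoming task has no user mm. The sketch below approximates that caller; the active_mm bookkeeping (mmgrab_lazy_tlb() and friends) is version dependent, so treat it as an illustration of when the hint fires rather than the definitive context_switch() body.

/*
 * Hedged sketch of the scheduler-side caller in kernel/sched/core.c;
 * details are approximate and version dependent.
 */
static void context_switch_mm_sketch(struct task_struct *prev,
				     struct task_struct *next)
{
	if (!next->mm) {				/* switching to a kernel thread */
		enter_lazy_tlb(prev->active_mm, next);	/* hint: no user mm follows */
		next->active_mm = prev->active_mm;	/* keep borrowing prev's mm */
	} else {					/* switching to a user task */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);
	}
}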


@@ -971,27 +971,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
}
}
/*
* Please ignore the name of this function. It should be called
* switch_to_kernel_thread().
*
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
* kernel thread or other context without an mm. Acceptable implementations
* include doing nothing whatsoever, switching to init_mm, or various clever
* lazy tricks to try to minimize TLB flushes.
*
* The scheduler reserves the right to call enter_lazy_tlb() several times
* in a row. It will notify us that we're going back to a real mm by
* calling switch_mm_irqs_off().
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes


@@ -95,6 +95,7 @@ struct sched_domain {
unsigned int newidle_call;
unsigned int newidle_success;
unsigned int newidle_ratio;
u64 newidle_stamp;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;


@@ -146,4 +146,7 @@ struct clone_args {
SCHED_FLAG_KEEP_ALL | \
SCHED_FLAG_UTIL_CLAMP)
/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */
#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01
#endif /* _UAPI_LINUX_SCHED_H */


@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
}
}
void raw_spin_rq_unlock(struct rq *rq)
{
raw_spin_unlock(rq_lockp(rq));
}
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -5678,7 +5673,7 @@ static void sched_tick_remote(struct work_struct *work)
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
queue_delayed_work(system_unbound_wq, dwork, HZ);
queue_delayed_work(system_dfl_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
@@ -5697,7 +5692,7 @@ static void sched_tick_start(int cpu)
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
queue_delayed_work(system_dfl_wq, &twork->work, HZ);
}
}


@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
int flags)
{
struct task_struct *p = dl_task_of(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
if (!schedstat_enabled())
return;
if (p != rq->curr)
update_stats_wait_end_dl(dl_rq, dl_se);
if ((flags & DEQUEUE_SLEEP)) {
unsigned int state;
@@ -3613,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
struct sched_dl_entity *dl_se = &p->dl;
struct rq *rq = task_rq(p);
u64 adj_deadline;
attr->sched_priority = p->rt_priority;
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
guard(raw_spinlock_irq)(&rq->__lock);
update_rq_clock(rq);
if (task_current(rq, p))
update_curr_dl(rq);
attr->sched_runtime = dl_se->runtime;
adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
attr->sched_deadline = adj_deadline;
} else {
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
}
attr->sched_period = dl_se->dl_period;
attr->sched_flags &= ~SCHED_DL_FLAGS;
attr->sched_flags |= dl_se->flags;


@@ -8,6 +8,7 @@
*/
#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/log2.h>
#include "sched.h"
/*
@@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
s64 zero_vruntime = -1, sum_w_vruntime = -1;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned int sum_shift;
unsigned long flags;
u64 sum_weight;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
@@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
right_vruntime = last->vruntime;
zero_vruntime = cfs_rq->zero_vruntime;
sum_w_vruntime = cfs_rq->sum_w_vruntime;
sum_weight = cfs_rq->sum_weight;
sum_shift = cfs_rq->sum_shift;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
sum_weight);
SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",


@@ -3208,8 +3208,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
intv = READ_ONCE(scx_watchdog_interval);
if (intv < ULONG_MAX)
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
intv);
queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
}
void scx_tick(struct rq *rq)
@@ -5233,7 +5232,7 @@ static void refresh_watchdog(void)
WRITE_ONCE(scx_watchdog_interval, intv);
if (intv < ULONG_MAX)
mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
else
cancel_delayed_work_sync(&scx_watchdog_work);
}


@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
update_sysctl();
}
#ifndef CONFIG_64BIT
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
#else
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
return (delta_exec * weight) / lw->weight;
}
#endif
/*
* delta /= w
@@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
{
#ifdef CONFIG_64BIT
if (cfs_rq->sum_shift)
w = max(2UL, w >> cfs_rq->sum_shift);
#endif
return w;
}
static inline void
__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
s64 w_vruntime, key = entity_key(cfs_rq, se);
w_vruntime = key * weight;
WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
cfs_rq->sum_w_vruntime += w_vruntime;
cfs_rq->sum_weight += weight;
}
static void
sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight;
s64 key, tmp;
again:
weight = avg_vruntime_weight(cfs_rq, se->load.weight);
key = entity_key(cfs_rq, se);
if (check_mul_overflow(key, weight, &key))
goto overflow;
if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
goto overflow;
cfs_rq->sum_w_vruntime = tmp;
cfs_rq->sum_weight += weight;
return;
overflow:
/*
* There's gotta be a limit -- if we're still failing at this point
* there's really nothing much to be done about things.
*/
BUG_ON(cfs_rq->sum_shift >= 10);
cfs_rq->sum_shift++;
/*
* Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
*/
cfs_rq->sum_w_vruntime = 0;
cfs_rq->sum_weight = 0;
for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
node; node = rb_next(node))
__sum_w_vruntime_add(cfs_rq, __node_2_se(node));
goto again;
}
static void
sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
if (sched_feat(PARANOID_AVG))
return sum_w_vruntime_add_paranoid(cfs_rq, se);
cfs_rq->sum_w_vruntime += key * weight;
cfs_rq->sum_weight += weight;
__sum_w_vruntime_add(cfs_rq, se);
}
static void
sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
s64 key = entity_key(cfs_rq, se);
cfs_rq->sum_w_vruntime -= key * weight;
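
To make the overflow handling above concrete, here is a small stand-alone user-space sketch of the same idea (a hypothetical demo, not kernel code; the values in key[]/weight[] are made up): accumulate key * weight with the weights scaled down by a shared shift, and whenever a term or the running sum would overflow, bump the shift, rebuild the whole sum and retry, mirroring the "goto again" loop in sum_w_vruntime_add_paranoid().

#include <stdio.h>
#include <stdint.h>

#define NR 3

static int64_t  key[NR]    = { 1LL << 40, -(1LL << 39), 1LL << 41 };	/* (vruntime - zero_vruntime) */
static uint64_t weight[NR] = { 1ULL << 30, 1ULL << 28, 1ULL << 31 };

static unsigned int shift;	/* plays the role of cfs_rq->sum_shift */
static int64_t  sum;		/* plays the role of cfs_rq->sum_w_vruntime */
static uint64_t sum_w;		/* plays the role of cfs_rq->sum_weight */

static uint64_t scaled(uint64_t w)
{
	if (shift) {
		w >>= shift;
		if (w < 2)
			w = 2;	/* mirrors max(2UL, w >> sum_shift) */
	}
	return w;
}

static int add_term(int i)
{
	int64_t term;

	if (__builtin_mul_overflow(key[i], (int64_t)scaled(weight[i]), &term))
		return -1;
	if (__builtin_add_overflow(sum, term, &sum))
		return -1;
	sum_w += scaled(weight[i]);
	return 0;
}

static void rebuild(void)
{
again:
	sum = 0;
	sum_w = 0;
	for (int i = 0; i < NR; i++) {
		if (add_term(i)) {
			shift++;	/* scale every weight down further and retry */
			goto again;
		}
	}
}

int main(void)
{
	rebuild();
	printf("shift=%u weighted avg key=%lld\n",
	       shift, (long long)(sum / (int64_t)sum_w));
	return 0;
}

Compared to the kernel code, the error handling is collapsed: the patch additionally caps sum_shift at 10 with BUG_ON() rather than retrying indefinitely.
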
@@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
s64 runtime = cfs_rq->sum_w_vruntime;
if (curr) {
unsigned long w = scale_load_down(curr->load.weight);
unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
runtime += entity_key(cfs_rq, curr) * w;
weight += w;
@@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
if (runtime < 0)
runtime -= (weight - 1);
delta = div_s64(runtime, weight);
delta = div64_long(runtime, weight);
} else if (curr) {
/*
* When there is but one element, it is the average.
@@ -764,17 +829,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
*
* -r_max < lag < max(r_max, q)
*/
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
{
u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
vlag = avruntime - se->vruntime;
limit = calc_delta_fair(max_slice, se);
se->vlag = clamp(vlag, -limit, limit);
return clamp(vlag, -limit, limit);
}
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
WARN_ON_ONCE(!se->on_rq);
se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
}
/*
@@ -801,7 +871,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
avg += entity_key(cfs_rq, curr) * weight;
load += weight;
@@ -3840,23 +3910,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
se_weight(se) * -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
static void
rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
{
unsigned long old_weight = se->load.weight;
/*
* VRUNTIME
* --------
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
*
* Proof: For contradiction assume this is not true, so we can
* re-weight without changing vruntime at !0-lag point.
*
* Weight VRuntime Avg-VRuntime
* before w v V
* after w' v' V'
*
* Since lag needs to be preserved through re-weight:
*
* lag = (V - v)*w = (V'- v')*w', where v = v'
* ==> V' = (V - v)*w/w' + v (1)
*
* Let W be the total weight of the entities before reweight,
* since V' is the new weighted average of entities:
*
* V' = (WV + w'v - wv) / (W + w' - w) (2)
*
* by using (1) & (2) we obtain:
*
* (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
* ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
* ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
* ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
*
* Since we are doing at !0-lag point which means V != v, we
* can simplify (3):
*
* ==> W / (W + w' - w) = w / w'
* ==> Ww' = Ww + ww' - ww
* ==> W * (w' - w) = w * (w' - w)
* ==> W = w (re-weight indicates w' != w)
*
* So the cfs_rq contains only one entity, hence vruntime of
* the entity @v should always equal to the cfs_rq's weighted
* average vruntime @V, which means we will always re-weight
* at 0-lag point, thus breach assumption. Proof completed.
*
*
* COROLLARY #2: Re-weight does NOT affect weighted average
* vruntime of all the entities.
*
* Proof: According to corollary #1, Eq. (1) should be:
*
* (V - v)*w = (V' - v')*w'
* ==> v' = V' - (V - v)*w/w' (4)
*
* According to the weighted average formula, we have:
*
* V' = (WV - wv + w'v') / (W - w + w')
* = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
* = (WV - wv + w'V' - Vw + wv) / (W - w + w')
* = (WV + w'V' - Vw) / (W - w + w')
*
* ==> V'*(W - w + w') = WV + w'V' - Vw
* ==> V' * (W - w) = (W - w) * V (5)
*
* If the entity is the only one in the cfs_rq, then reweight
* always occurs at 0-lag point, so V won't change. Or else
* there are other entities, hence W != w, then Eq. (5) turns
* into V' = V. So V won't change in either case, proof done.
*
*
* So according to corollary #1 & #2, the effect of re-weight
* on vruntime should be:
*
* v' = V' - (V - v) * w / w' (4)
* = V - (V - v) * w / w'
* = V - vl * w / w'
* = V - vl'
*/
se->vlag = div64_long(se->vlag * old_weight, weight);
/*
* DEADLINE
* --------
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
*
* d' = v' + (d - v)*w/w'
* = V' - (V - v)*w/w' + (d - v)*w/w'
* = V - (V - v)*w/w' + (d - v)*w/w'
* = V + (d - V)*w/w'
*/
if (se->rel_deadline)
se->deadline = div64_long(se->deadline * old_weight, weight);
if (rel_vprot)
se->vprot = div64_long(se->vprot * old_weight, weight);
}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
bool rel_vprot = false;
u64 vprot;
u64 avruntime = 0;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
avruntime = avg_vruntime(cfs_rq);
se->vlag = entity_lag(cfs_rq, se, avruntime);
se->deadline -= avruntime;
se->rel_deadline = 1;
if (curr && protect_slice(se)) {
vprot = se->vprot - se->vruntime;
se->vprot -= avruntime;
rel_vprot = true;
}
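
A quick numeric sanity check of the re-weight rule above, using illustrative values only (not from the patch): take V = 100, v = 80, w = 2, w' = 4, so vl = V - v = 20.

/*
 * v' = V - vl * w / w'  =  100 - 20 * 2 / 4  =  90
 *
 * Lag is preserved across the re-weight:
 *   before: (V  - v ) * w  = 20 * 2 = 40
 *   after:  (V' - v') * w' = 10 * 4 = 40	(V' = V by corollary #2)
 */
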
@@ -3867,30 +4039,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
dequeue_load_avg(cfs_rq, se);
/*
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * se->load.weight, weight);
if (se->rel_deadline)
se->deadline = div_s64(se->deadline * se->load.weight, weight);
if (rel_vprot)
vprot = div_s64(vprot * se->load.weight, weight);
rescale_entity(se, weight, rel_vprot);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
place_entity(cfs_rq, se, 0);
if (rel_vprot)
se->vprot = se->vruntime + vprot;
se->vprot += avruntime;
se->deadline += avruntime;
se->rel_deadline = 0;
se->vruntime = avruntime - se->vlag;
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5180,7 +5345,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
long load;
lag = se->vlag;
@@ -5238,17 +5403,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
load += avg_vruntime_weight(cfs_rq, curr->load.weight);
lag *= load + scale_load_down(se->load.weight);
lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
lag = div64_long(lag, load);
}
se->vruntime = vruntime - lag;
if (se->rel_deadline) {
if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -6853,16 +7018,15 @@ static inline void hrtick_update(struct rq *rq)
static inline bool cpu_overutilized(int cpu)
{
unsigned long rq_util_min, rq_util_max;
unsigned long rq_util_max;
if (!sched_energy_enabled())
return false;
rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
}
/*
@@ -6900,9 +7064,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
static int sched_idle_cpu(int cpu)
static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
{
return sched_idle_rq(cpu_rq(cpu));
return sched_idle_rq(rq) && !task_has_idle_policy(p);
}
static int choose_idle_cpu(int cpu, struct task_struct *p)
{
return available_idle_cpu(cpu) ||
choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -7467,7 +7637,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
if (sched_idle_cpu(i))
if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7558,8 +7728,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
sched_cpu_cookie_match(cpu_rq(cpu), p))
if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7632,7 +7801,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7667,7 +7837,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7789,7 +7959,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7860,7 +8030,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7868,7 +8038,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7900,7 +8070,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -10047,6 +10217,7 @@ struct sg_lb_stats {
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -10279,6 +10450,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
/*
* With EAS and uclamp, 1 CPU in the group must be overutilized to
* consider the group overloaded.
*/
if (sched_energy_enabled() && !sgs->group_overutilized)
return false;
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
@@ -10462,14 +10640,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_overloaded: sched_group is overloaded
* @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
bool *sg_overloaded,
bool *sg_overutilized)
bool *sg_overloaded)
{
int i, nr_running, local_group, sd_flags = env->sd->flags;
bool balancing_at_rd = !env->sd->parent;
@@ -10491,7 +10667,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (cpu_overutilized(i))
*sg_overutilized = 1;
sgs->group_overutilized = 1;
/*
* No need to call idle_cpu() if nr_running is not 0
@@ -11162,13 +11338,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
sg_overutilized |= sgs->group_overutilized;
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -12289,7 +12467,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
sd->newidle_success += success;
if (sd->newidle_call >= 1024) {
sd->newidle_ratio = sd->newidle_success;
u64 now = sched_clock();
s64 delta = now - sd->newidle_stamp;
sd->newidle_stamp = now;
int ratio = 0;
if (delta < 0)
delta = 0;
if (sched_feat(NI_RATE)) {
/*
* ratio delta freq
*
* 1024 - 4 s - 128 Hz
* 512 - 2 s - 256 Hz
* 256 - 1 s - 512 Hz
* 128 - .5 s - 1024 Hz
* 64 - .25 s - 2048 Hz
*/
ratio = delta >> 22;
}
ratio += sd->newidle_success;
sd->newidle_ratio = min(1024, ratio);
sd->newidle_call /= 2;
sd->newidle_success /= 2;
}
@@ -12336,7 +12537,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12374,7 +12575,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
busy = !idle && !sched_idle_cpu(cpu);
busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
@@ -12996,7 +13197,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
unsigned int weight = 1;
if (sched_feat(NI_RANDOM)) {
if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
/*
* Throw a 1k sided dice; and only run
* newidle_balance according to the success


@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(DELAY_DEQUEUE, true)
SCHED_FEAT(DELAY_ZERO, true)
SCHED_FEAT(PARANOID_AVG, false)
/*
* Allow wakeup-time preemption of the current task:
*/
@@ -126,3 +128,4 @@ SCHED_FEAT(LATENCY_WARN, false)
* Do newidle balancing proportional to its success rate using randomization.
*/
SCHED_FEAT(NI_RANDOM, true)
SCHED_FEAT(NI_RATE, true)


@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int flags)
{
struct task_struct *p = NULL;
struct rq *rq = rq_of_rt_rq(rt_rq);
if (!schedstat_enabled())
return;
if (rt_entity_is_task(rt_se))
if (rt_entity_is_task(rt_se)) {
p = rt_task_of(rt_se);
if (p != rq->curr)
update_stats_wait_end_rt(rt_rq, rt_se);
}
if ((flags & DEQUEUE_SLEEP) && p) {
unsigned int state;


@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
@@ -684,8 +684,9 @@ struct cfs_rq {
s64 sum_w_vruntime;
u64 sum_weight;
u64 zero_vruntime;
unsigned int sum_shift;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 zero_vruntime_fi;
@@ -1609,15 +1610,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
extern bool raw_spin_rq_trylock(struct rq *rq)
__cond_acquires(true, __rq_lockp(rq));
extern void raw_spin_rq_unlock(struct rq *rq)
__releases(__rq_lockp(rq));
static inline void raw_spin_rq_lock(struct rq *rq)
__acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
static inline void raw_spin_rq_unlock(struct rq *rq)
__releases(__rq_lockp(rq))
{
raw_spin_unlock(rq_lockp(rq));
}
static inline void raw_spin_rq_lock_irq(struct rq *rq)
__acquires(__rq_lockp(rq))
{


@@ -881,10 +881,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
return -E2BIG;
}
static void get_params(struct task_struct *p, struct sched_attr *attr)
static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
__getparam_dl(p, attr, flags);
} else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
} else {
@@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
return -ESRCH;
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
get_params(p, &attr);
get_params(p, &attr, 0);
return sched_setattr(p, &attr);
}
@@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
usize < SCHED_ATTR_SIZE_VER0 || flags))
usize < SCHED_ATTR_SIZE_VER0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1043,6 +1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!p)
return -ESRCH;
if (flags) {
if (!task_has_dl_policy(p) ||
flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
return -EINVAL;
}
retval = security_task_getscheduler(p);
if (retval)
return retval;
@@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
get_params(p, &kattr);
get_params(p, &kattr, flags);
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
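
As a usage illustration (a hypothetical user-space test, not part of this commit): a SCHED_DEADLINE task can pass the new flag in sched_getattr()'s flags argument to read its remaining runtime and adjusted absolute deadline rather than the static reservation parameters. A minimal sketch, assuming the uapi header exports SCHED_GETATTR_FLAG_DL_DYNAMIC (0x01 per the hunk above):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_GETATTR_FLAG_DL_DYNAMIC
#define SCHED_GETATTR_FLAG_DL_DYNAMIC	0x01
#endif

/* SCHED_ATTR_SIZE_VER0 layout; glibc does not provide a wrapper */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	/* pid 0 == calling thread; fails with -EINVAL unless it is SCHED_DEADLINE */
	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr),
		    SCHED_GETATTR_FLAG_DL_DYNAMIC) < 0) {
		perror("sched_getattr");
		return 1;
	}
	printf("remaining runtime: %llu ns, absolute deadline: %llu ns\n",
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline);
	return 0;
}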


@@ -4,6 +4,7 @@
*/
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include <linux/bsearch.h>
#include "sched.h"
@@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl,
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
u64 now = sched_clock();
sd_weight = cpumask_weight(tl->mask(tl, cpu));
@@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl,
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,
.newidle_stamp = now,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,