Merge v6.13 into drm-next
A regression was caused by commit e4b5ccd392 ("drm/v3d: Ensure job pointer is set to NULL after job completion"), but this commit is not yet in next-fixes; fast-forward it. Note that this recreates Linus' merge in 96c84703f1 ("Merge tag 'drm-next-2025-01-17' of https://gitlab.freedesktop.org/drm/kernel") because I didn't want to backmerge a random point in the merge window.

Signed-off-by: Simona Vetter <simona.vetter@ffwll.ch>
@@ -197,10 +197,8 @@ static struct cpuset top_cpuset = {
 
 /*
  * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
- * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
  * structures. Note that cpuset_mutex needs to be a mutex as it is used in
  * paths that rely on priority inheritance (e.g. scheduler - on RT) for
  * correctness.
@@ -229,9 +227,6 @@ static struct cpuset top_cpuset = {
  * The cpuset_common_seq_show() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
@@ -890,7 +885,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	 */
 	if (cgrpv2) {
 		for (i = 0; i < ndoms; i++) {
-			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			/*
+			 * The top cpuset may contain some boot time isolated
+			 * CPUs that need to be excluded from the sched domain.
+			 */
+			if (csa[i] == &top_cpuset)
+				cpumask_and(doms[i], csa[i]->effective_cpus,
+					    housekeeping_cpumask(HK_TYPE_DOMAIN));
+			else
+				cpumask_copy(doms[i], csa[i]->effective_cpus);
 			if (dattr)
 				dattr[i] = SD_ATTR_INIT;
 		}
@@ -3121,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	int retval = -ENODEV;
 
 	buf = strstrip(buf);
-
-	/*
-	 * CPU or memory hotunplug may leave @cs w/o any execution
-	 * resources, in which case the hotplug code asynchronously updates
-	 * configuration and transfers all tasks to the nearest ancestor
-	 * which can execute.
-	 *
-	 * As writes to "cpus" or "mems" may restore @cs's execution
-	 * resources, wait for the previously scheduled operations before
-	 * proceeding, so that we don't end up keep removing tasks added
-	 * after execution capability is restored.
-	 *
-	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
-	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
-	 * operation like this one can lead to a deadlock through kernfs
-	 * active_ref protection. Let's break the protection. Losing the
-	 * protection is okay as we check whether @cs is online after
-	 * grabbing cpuset_mutex anyway. This only happens on the legacy
-	 * hierarchies.
-	 */
-	css_get(&cs->css);
-	kernfs_break_active_protection(of->kn);
-
 	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
@@ -3176,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	cpus_read_unlock();
-	kernfs_unbreak_active_protection(of->kn);
-	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
 	return retval ?: nbytes;
 }
@@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	},
 	[CPUHP_AP_HRTIMERS_DYING] = {
 		.name			= "hrtimers:dying",
-		.startup.single		= NULL,
+		.startup.single		= hrtimers_cpu_starting,
 		.teardown.single	= hrtimers_cpu_dying,
 	},
 	[CPUHP_AP_TICK_DYING] = {
@@ -1915,6 +1915,7 @@ void uprobe_free_utask(struct task_struct *t)
 	if (!utask)
 		return;
 
+	t->utask = NULL;
 	WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
 
 	timer_delete_sync(&utask->ri_timer);
@@ -1924,7 +1925,6 @@ void uprobe_free_utask(struct task_struct *t)
 		ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 
 	kfree(utask);
-	t->utask = NULL;
 }
 
 #define RI_TIMER_PERIOD	(HZ / 10) /* 100 ms */
@@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 |
 
 # Create archive and try to normalize metadata for reproducibility.
 tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+    --exclude=".__afs*" --exclude=".nfs*" \
     --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
     -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
 
@@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
 
 	lockdep_assert_rq_held(rq);
@@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	 * See scx_ops_disable_workfn() for the explanation on the
 	 * bypassing test.
 	 */
-	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-	    prev->scx.slice && !scx_rq_bypassing(rq)) {
+	if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
 		goto has_tasks;
 	}
@@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 
 	flush_dispatch_buf(rq);
 
+	if (prev_on_rq && prev->scx.slice) {
+		rq->scx.flags |= SCX_RQ_BAL_KEEP;
+		goto has_tasks;
+	}
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 	if (consume_global_dsq(rq))
@@ -2838,8 +2842,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	 * Didn't find another task to run. Keep running @prev unless
 	 * %SCX_OPS_ENQ_LAST is in effect.
 	 */
-	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-	    (!static_branch_unlikely(&scx_ops_enq_last) ||
+	if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
 	     scx_rq_bypassing(rq))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
 		goto has_tasks;
@@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 	 */
 	if (p->scx.slice && !scx_rq_bypassing(rq)) {
 		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
-		return;
+		goto switch_class;
 	}
 
 	/*
@@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		}
 	}
 
+switch_class:
 	if (next && next->sched_class != &ext_sched_class)
 		switch_class(rq, next);
 }
@@ -3586,16 +3590,8 @@ static void reset_idle_masks(void)
 	cpumask_copy(idle_masks.smt, cpu_online_mask);
 }
 
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
 {
-	int cpu = cpu_of(rq);
-
-	if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
-		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
-			return;
-	}
-
 	if (idle)
 		cpumask_set_cpu(cpu, idle_masks.cpu);
 	else
@@ -3622,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
 #endif
 }
 
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Trigger ops.update_idle() only when transitioning from a task to
+	 * the idle thread and vice versa.
+	 *
+	 * Idle transitions are indicated by do_notify being set to true,
+	 * managed by put_prev_task_idle()/set_next_task_idle().
+	 */
+	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+	/*
+	 * Update the idle masks:
+	 * - for real idle transitions (do_notify == true)
+	 * - for idle-to-idle transitions (indicated by the previous task
+	 *   being the idle thread, managed by pick_task_idle())
+	 *
+	 * Skip updating idle masks if the previous task is not the idle
+	 * thread, since set_next_task_idle() has already handled it when
+	 * transitioning from a task to the idle thread (calling this
+	 * function with do_notify == true).
+	 *
+	 * In this way we can avoid updating the idle masks twice,
+	 * unnecessarily.
+	 */
+	if (static_branch_likely(&scx_builtin_idle_enabled))
+		if (do_notify || is_idle_task(rq->curr))
+			update_builtin_idle(cpu, idle);
+}
+
 static void handle_hotplug(struct rq *rq, bool online)
 {
 	int cpu = cpu_of(rq);
@@ -4744,10 +4791,9 @@ static void scx_ops_bypass(bool bypass)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
-		struct rq_flags rf;
 		struct task_struct *p, *n;
 
-		rq_lock(rq, &rf);
+		raw_spin_rq_lock(rq);
 
 		if (bypass) {
 			WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4809,7 @@ static void scx_ops_bypass(bool bypass)
 		 * sees scx_rq_bypassing() before moving tasks to SCX.
 		 */
 		if (!scx_enabled()) {
-			rq_unlock(rq, &rf);
+			raw_spin_rq_unlock(rq);
 			continue;
 		}
 
@@ -4783,10 +4829,11 @@ static void scx_ops_bypass(bool bypass)
 			sched_enq_and_set_task(&ctx);
 		}
 
-		rq_unlock(rq, &rf);
-
 		/* resched to restore ticks and idle state */
-		resched_cpu(cpu);
+		if (cpu_online(cpu) || cpu == smp_processor_id())
+			resched_curr(rq);
+
+		raw_spin_rq_unlock(rq);
 	}
 
 	atomic_dec(&scx_ops_breather_depth);
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 
 #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
 
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 {
 	if (scx_enabled())
-		__scx_update_idle(rq, idle);
+		__scx_update_idle(rq, idle, do_notify);
 }
 #else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -689,21 +689,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
  *
  * XXX could add max_slice to the augmented data to track this.
  */
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	s64 vlag, limit;
 
-	vlag = avruntime - se->vruntime;
-	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
-	return clamp(vlag, -limit, limit);
-}
-
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
 	SCHED_WARN_ON(!se->on_rq);
 
-	se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+	vlag = avg_vruntime(cfs_rq) - se->vruntime;
+	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+	se->vlag = clamp(vlag, -limit, limit);
 }
 
 /*
@@ -3774,137 +3769,32 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
-			   unsigned long weight)
-{
-	unsigned long old_weight = se->load.weight;
-	s64 vlag, vslice;
-
-	/*
-	 * VRUNTIME
-	 * --------
-	 *
-	 * COROLLARY #1: The virtual runtime of the entity needs to be
-	 * adjusted if re-weight at !0-lag point.
-	 *
-	 * Proof: For contradiction assume this is not true, so we can
-	 * re-weight without changing vruntime at !0-lag point.
-	 *
-	 *             Weight   VRuntime   Avg-VRuntime
-	 *     before    w          v            V
-	 *      after   w'         v'           V'
-	 *
-	 * Since lag needs to be preserved through re-weight:
-	 *
-	 *      lag = (V - v)*w = (V'- v')*w', where v = v'
-	 *      ==>     V' = (V - v)*w/w' + v             (1)
-	 *
-	 * Let W be the total weight of the entities before reweight,
-	 * since V' is the new weighted average of entities:
-	 *
-	 *      V' = (WV + w'v - wv) / (W + w' - w)       (2)
-	 *
-	 * by using (1) & (2) we obtain:
-	 *
-	 *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
-	 *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
-	 *      ==> (WV - Wv)/(W + w' - w)     = (V - v)*w/w'
-	 *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w'   (3)
-	 *
-	 * Since we are doing at !0-lag point which means V != v, we
-	 * can simplify (3):
-	 *
-	 *      ==>     W / (W + w' - w)       = w / w'
-	 *      ==>     Ww' = Ww + ww' - ww
-	 *      ==>     W * (w' - w)           = w * (w' - w)
-	 *      ==>     W = w   (re-weight indicates w' != w)
-	 *
-	 * So the cfs_rq contains only one entity, hence vruntime of
-	 * the entity @v should always equal to the cfs_rq's weighted
-	 * average vruntime @V, which means we will always re-weight
-	 * at 0-lag point, thus breach assumption. Proof completed.
-	 *
-	 *
-	 * COROLLARY #2: Re-weight does NOT affect weighted average
-	 * vruntime of all the entities.
-	 *
-	 * Proof: According to corollary #1, Eq. (1) should be:
-	 *
-	 *      (V - v)*w = (V' - v')*w'
-	 *      ==>    v' = V' - (V - v)*w/w'             (4)
-	 *
-	 * According to the weighted average formula, we have:
-	 *
-	 *      V' = (WV - wv + w'v') / (W - w + w')
-	 *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
-	 *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
-	 *         = (WV + w'V' - Vw) / (W - w + w')
-	 *
-	 *      ==>  V'*(W - w + w') = WV + w'V' - Vw
-	 *      ==>     V' * (W - w) = (W - w) * V        (5)
-	 *
-	 * If the entity is the only one in the cfs_rq, then reweight
-	 * always occurs at 0-lag point, so V won't change. Or else
-	 * there are other entities, hence W != w, then Eq. (5) turns
-	 * into V' = V. So V won't change in either case, proof done.
-	 *
-	 *
-	 * So according to corollary #1 & #2, the effect of re-weight
-	 * on vruntime should be:
-	 *
-	 *      v' = V' - (V - v) * w / w'                (4)
-	 *         = V  - (V - v) * w / w'
-	 *         = V  - vl * w / w'
-	 *         = V  - vl'
-	 */
-	if (avruntime != se->vruntime) {
-		vlag = entity_lag(avruntime, se);
-		vlag = div_s64(vlag * old_weight, weight);
-		se->vruntime = avruntime - vlag;
-	}
-
-	/*
-	 * DEADLINE
-	 * --------
-	 *
-	 * When the weight changes, the virtual time slope changes and
-	 * we should adjust the relative virtual deadline accordingly.
-	 *
-	 *      d' = v' + (d - v)*w/w'
-	 *         = V' - (V - v)*w/w' + (d - v)*w/w'
-	 *         = V  - (V - v)*w/w' + (d - v)*w/w'
-	 *         = V  + (d - V)*w/w'
-	 */
-	vslice = (s64)(se->deadline - avruntime);
-	vslice = div_s64(vslice * old_weight, weight);
-	se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	bool curr = cfs_rq->curr == se;
-	u64 avruntime;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
-		avruntime = avg_vruntime(cfs_rq);
+		update_entity_lag(cfs_rq, se);
+		se->deadline -= se->vruntime;
+		se->rel_deadline = 1;
 		if (!curr)
 			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	if (se->on_rq) {
-		reweight_eevdf(se, avruntime, weight);
-	} else {
-		/*
-		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-		 * we need to scale se->vlag when w_i changes.
-		 */
-		se->vlag = div_s64(se->vlag * se->load.weight, weight);
-	}
+	/*
+	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+	 * we need to scale se->vlag when w_i changes.
	 */
+	se->vlag = div_s64(se->vlag * se->load.weight, weight);
+	if (se->rel_deadline)
+		se->deadline = div_s64(se->deadline * se->load.weight, weight);
 
 	update_load_set(&se->load, weight);
 
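The corollaries above lend themselves to a quick numerical sanity check. The following standalone C sketch (illustrative only, not kernel code; plain doubles stand in for the fixed-point vruntime arithmetic) verifies that rescaling the lag by w/w' per Eq. (4) preserves the absolute lag (V - v)*w across a reweight:

/* Hypothetical standalone check of the reweight math; not from this diff. */
#include <assert.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
	double V  = 100.0;  /* avg_vruntime V (arbitrary units)      */
	double v  = 90.0;   /* entity vruntime, !0-lag point (v != V) */
	double w  = 1024.0; /* old weight                             */
	double w2 = 2048.0; /* new weight w'                          */

	double lag_before = (V - v) * w;

	/* Eq. (4): v' = V - (V - v)*w/w', with V' = V per corollary #2 */
	double v2 = V - (V - v) * w / w2;
	double lag_after = (V - v2) * w2;

	printf("lag before: %f, lag after: %f\n", lag_before, lag_after);
	assert(fabs(lag_before - lag_after) < 1e-9);
	return 0;
}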
@@ -3919,6 +3809,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
+		place_entity(cfs_rq, se, 0);
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
 
@@ -4065,7 +3956,11 @@ static void update_cfs_group(struct sched_entity *se)
 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	long shares;
 
-	if (!gcfs_rq)
+	/*
+	 * When a group becomes empty, preserve its weight. This matters for
+	 * DELAY_DEQUEUE.
+	 */
+	if (!gcfs_rq || !gcfs_rq->load.weight)
 		return;
 
 	if (throttled_hierarchy(gcfs_rq))
@@ -5359,7 +5254,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	se->vruntime = vruntime - lag;
 
-	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+	if (se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
 		return;
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	dl_server_update_idle_time(rq, prev);
-	scx_update_idle(rq, false);
+	scx_update_idle(rq, false, true);
 }
 
 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
 {
 	update_idle_core(rq);
-	scx_update_idle(rq, true);
+	scx_update_idle(rq, true, true);
 	schedstat_inc(rq->sched_goidle);
 	next->se.exec_start = rq_clock_task(rq);
 }
 
 struct task_struct *pick_task_idle(struct rq *rq)
 {
+	scx_update_idle(rq, true, false);
 	return rq->idle;
 }
 
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 
 	if (!list_empty(&q->list)) {
 		/*
-		 * If task group is exiting with the signal already pending,
-		 * wait for __exit_signal() to do its job. Otherwise if
-		 * ignored, it's not supposed to be queued. Try to survive.
+		 * The signal was ignored and blocked. The timer
+		 * expiry queued it because blocked signals are
+		 * queued independent of the ignored state.
+		 *
+		 * The unblocking set SIGPENDING, but the signal
+		 * was not yet dequeued from the pending list.
+		 * So prepare_signal() sees unblocked and ignored,
+		 * which ends up here. Leave it queued like a
+		 * regular signal.
+		 *
+		 * The same happens when the task group is exiting
+		 * and the signal is already queued.
+		 * prepare_signal() treats SIGNAL_GROUP_EXIT as
+		 * ignored independent of its queued state. This
+		 * gets cleaned up in __exit_signal().
 		 */
-		WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
 		goto out;
 	}
 
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 		goto out;
 	}
 
-	/* This should never happen and leaks a reference count */
-	if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
-		hlist_del_init(&tmr->ignored_list);
-
 	if (unlikely(!list_empty(&q->list))) {
 		/* This holds a reference count already */
 		result = TRACE_SIGNAL_ALREADY_PENDING;
 		goto out;
 	}
 
-	posixtimer_sigqueue_getref(q);
+	/*
+	 * If the signal is on the ignore list, it got blocked after it was
+	 * ignored earlier. But nothing lifted the ignore. Move it back to
+	 * the pending list to be consistent with the regular signal
+	 * handling. This already holds a reference count.
+	 *
+	 * If it's not on the ignore list acquire a reference count.
+	 */
+	if (likely(hlist_unhashed(&tmr->ignored_list)))
+		posixtimer_sigqueue_getref(q);
+	else
+		hlist_del_init(&tmr->ignored_list);
+
 	posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
 	result = TRACE_SIGNAL_DELIVERED;
 out:
@@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	}
 
 	cpu_base->cpu = cpu;
+	hrtimer_cpu_base_init_expiry_lock(cpu_base);
+	return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+	/* Clear out any left over state from a CPU down operation */
 	cpu_base->active_bases = 0;
 	cpu_base->hres_active = 0;
 	cpu_base->hang_detected = 0;
@@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
 	cpu_base->online = 1;
-	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
 
@@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 void __init hrtimers_init(void)
 {
 	hrtimers_prepare_cpu(smp_processor_id());
+	hrtimers_cpu_starting(smp_processor_id());
 	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }
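For context on the pairing being fixed here: a CPU hotplug step couples a startup callback (run when the CPU comes up) with a teardown callback (run when it goes down), and the hrtimers change above gives the dying-step teardown a matching startup so that state cleared on CPU down is reinitialized on CPU up. A minimal sketch of the same pattern with the generic cpuhp_setup_state() API follows; all names are hypothetical and not from this diff:

/* Hypothetical driver illustrating paired hotplug callbacks. */
#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int example_cpu_starting(unsigned int cpu)
{
	/* Reinitialize per-CPU state cleared by the dying callback. */
	return 0;
}

static int example_cpu_dying(unsigned int cpu)
{
	/* Tear down per-CPU state; pairs with example_cpu_starting(). */
	return 0;
}

static int __init example_init(void)
{
	/*
	 * Dynamic state: the startup callback runs on each currently
	 * online CPU and on every later hotplug-up; the teardown runs
	 * on hotplug-down. Returns the state id or a negative errno.
	 */
	return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:starting",
				 example_cpu_starting, example_cpu_dying);
}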
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 			break;
 
 		child = group;
-		group = group->parent;
+		/*
+		 * Pairs with the store release on group connection
+		 * to make sure group initialization is visible.
+		 */
+		group = READ_ONCE(group->parent);
 		data->childmask = child->groupmask;
+		WARN_ON_ONCE(!data->childmask);
 	} while (group);
 }
 
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
 	while ((node = timerqueue_getnext(&group->events))) {
 		evt = container_of(node, struct tmigr_event, nextevt);
 
-		if (!evt->ignore) {
+		if (!READ_ONCE(evt->ignore)) {
 			WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
 			return evt;
 		}
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
 	 * lock is held while updating the ignore flag in idle path. So this
	 * state change will not be lost.
 	 */
-	group->groupevt.ignore = true;
+	WRITE_ONCE(group->groupevt.ignore, true);
 
 	return walk_done;
 }
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	union tmigr_state childstate, groupstate;
 	bool remote = data->remote;
 	bool walk_done = false;
+	bool ignore;
 	u64 nextexp;
 
 	if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		nextexp = child->next_expiry;
 		evt = &child->groupevt;
 
-		evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+		/*
+		 * This can race with concurrent idle exit (activate).
+		 * If the current writer wins, a useless remote expiration may
+		 * be scheduled. If the activate wins, the event is properly
+		 * ignored.
+		 */
+		ignore = (nextexp == KTIME_MAX) ? true : false;
+		WRITE_ONCE(evt->ignore, ignore);
 	} else {
 		nextexp = data->nextexp;
 
 		first_childevt = evt = data->evt;
+		ignore = evt->ignore;
 
 		/*
 		 * Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * first event information of the group is updated properly and
 	 * also handled properly, so skip this fast return path.
 	 */
-	if (evt->ignore && !remote && group->parent)
+	if (ignore && !remote && group->parent)
 		return true;
 
 	raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * queue when the expiry time changed only or when it could be ignored.
 	 */
 	if (timerqueue_node_queued(&evt->nextevt)) {
-		if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+		if ((evt->nextevt.expires == nextexp) && !ignore) {
 			/* Make sure not to miss a new CPU event with the same expiry */
 			evt->cpu = first_childevt->cpu;
 			goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		WRITE_ONCE(group->next_expiry, KTIME_MAX);
 	}
 
-	if (evt->ignore) {
+	if (ignore) {
 		/*
 		 * When the next child event could be ignored (nextexp is
 		 * KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
+	/*
+	 * If this is a new top-level, prepare its groupmask in advance.
+	 * This avoids accidents where yet another new top-level is
+	 * created in the future and made visible before the current groupmask.
+	 */
+	if (list_empty(&tmigr_level_list[lvl])) {
+		group->groupmask = BIT(0);
+		/*
+		 * The previous top level has prepared its groupmask already,
+		 * simply account it as the first child.
+		 */
+		if (lvl > 0)
+			group->num_children = 1;
+	}
+
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	raw_spin_lock_irq(&child->lock);
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-	child->parent = parent;
-	child->groupmask = BIT(parent->num_children++);
+	if (activate) {
+		/*
+		 * @child is the old top and @parent the new one. In this
+		 * case groupmask is pre-initialized and @child already
+		 * accounted, along with its new sibling corresponding to the
+		 * CPU going up.
+		 */
+		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+	} else {
+		/* Adding @child for the CPU going up to @parent. */
+		child->groupmask = BIT(parent->num_children++);
+	}
+
+	/*
+	 * Make sure parent initialization is visible before publishing it to a
+	 * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+	 * address dependency that pairs with the READ_ONCE() in __walk_groups().
+	 */
+	smp_store_release(&child->parent, parent);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);
 
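The RELEASE/READ_ONCE pairing above is the usual lockless publication pattern: fully initialize an object, then publish the pointer with a release store, so a reader that fetches the pointer with READ_ONCE() and dereferences it is ordered by the address dependency. A condensed sketch of the idea with the same kernel primitives (hypothetical types, kernel context assumed; not the tmigr code itself):

/* Illustrative publication pattern; names are hypothetical. */
#include <linux/compiler.h>
#include <asm/barrier.h>

struct node {
	int data;
	struct node *parent;
};

/* Writer: fully initialize, then publish with RELEASE semantics. */
static void publish_parent(struct node *child, struct node *parent)
{
	parent->data = 42;                        /* init precedes publish */
	smp_store_release(&child->parent, parent);
}

/* Lockless reader: the address dependency orders the dereference. */
static int read_parent_data(struct node *child)
{
	struct node *p = READ_ONCE(child->parent);

	return p ? p->data : -1;                  /* sees initialized parent */
}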
@@ -4122,6 +4122,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		   preempt_model_none()      ? "server" :
 		   preempt_model_voluntary() ? "desktop" :
 		   preempt_model_full()      ? "preempt" :
+		   preempt_model_lazy()      ? "lazy" :
 		   preempt_model_rt()        ? "preempt_rt" :
 		   "unknown",
 		   /* These are reserved for later use */
@@ -182,6 +182,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	unsigned int trace_ctx;
+	u64 *calltime;
 	int ret;
 
 	if (ftrace_graph_ignore_func(gops, trace))
@@ -199,6 +200,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
 	if (!func_prolog_dec(tr, &data, &flags))
 		return 0;
 
+	calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+	if (!calltime)
+		return 0;
+
+	*calltime = trace_clock_local();
+
 	trace_ctx = tracing_gen_ctx_flags(flags);
 	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
@@ -213,12 +220,19 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	unsigned int trace_ctx;
+	u64 *calltime;
+	int size;
 
 	ftrace_graph_addr_finish(gops, trace);
 
 	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
+	calltime = fgraph_retrieve_data(gops->idx, &size);
+	if (!calltime)
+		return;
+	trace->calltime = *calltime;
+
 	trace_ctx = tracing_gen_ctx_flags(flags);
 	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
@@ -940,8 +940,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
 		}
 		/* a symbol specified */
 		symbol = kstrdup(argv[1], GFP_KERNEL);
-		if (!symbol)
-			return -ENOMEM;
+		if (!symbol) {
+			ret = -ENOMEM;
+			goto error;
+		}
 
 		tmp = strchr(symbol, '%');
 		if (tmp) {
@@ -118,6 +118,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned int trace_ctx;
+	u64 *calltime;
 	int ret = 0;
 
 	if (ftrace_graph_ignore_func(gops, trace))
@@ -135,6 +136,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
 	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return 0;
 
+	calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+	if (!calltime)
+		return 0;
+
+	*calltime = trace_clock_local();
+
 	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 	preempt_enable_notrace();
@@ -148,12 +155,19 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace,
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned int trace_ctx;
+	u64 *calltime;
+	int size;
 
 	ftrace_graph_addr_finish(gops, trace);
 
 	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return;
 
+	calltime = fgraph_retrieve_data(gops->idx, &size);
+	if (!calltime)
+		return;
+	trace->calltime = *calltime;
+
 	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 
@@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 		return;
 	}
 
+	WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
 	dwork->wq = wq;
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
@@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
+ * We queue the delayed_work to a specific CPU, for non-zero delays the
+ * caller must ensure it is online and can't go away. Callers that fail
+ * to ensure this, may get @dwork->timer queued to an offlined CPU and
+ * this will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
 * Return: %false if @work was already on a queue, %true otherwise. If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
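As a usage sketch of the contract documented above (hypothetical caller, not part of this diff): the return value distinguishes newly queued from already pending, and with a non-zero delay the caller is responsible for keeping the chosen CPU online:

/* Illustrative caller of queue_delayed_work_on(); names are hypothetical. */
#include <linux/workqueue.h>
#include <linux/printk.h>

static void my_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

static void my_work_fn(struct work_struct *work)
{
	/* Runs on @cpu's worker pool roughly HZ jiffies after queueing. */
}

static void kick_example(int cpu)
{
	/*
	 * Per the comment above: with a non-zero delay, @cpu must stay
	 * online, or the timer may be stranded on an offlined CPU.
	 */
	if (!queue_delayed_work_on(cpu, system_wq, &my_dwork, HZ))
		pr_debug("delayed work already pending\n");
}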