Merge tag 'cgroup-for-7.1-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - cpuset fixes:
     - Partition invalidation could return CPUs still in use by sibling
       partitions, producing overlapping effective_cpus (see the sketch
       just after this list)
     - cpuset_can_attach() over-reserved DL bandwidth on moves that
       stayed within the same root domain
     - Pending DL migration state leaked into subsequent attaches when a
       later can_attach() check failed
     - Reorder PF_EXITING and __GFP_HARDWALL checks so dying tasks can
       allocate from any node and exit quickly

 - dmem: propagate -ENOMEM instead of spinning forever when the fallback
   pool allocation also fails

 - selftests/cgroup: percpu test error-path leak, bogus numeric
   comparison of cpuset strings, and a zero-length read() that silently
   passed OOM-kill tests
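
As a rough illustration of the partition-invalidation fix, the sketch below
uses plain bitmasks standing in for the kernel's cpumasks; the CPU numbers
and mask values are invented for the example and are not taken from the patch.

#include <stdio.h>

int main(void)
{
	/*
	 * Hypothetical layout: the parent partition owns CPUs 0-7; cs was
	 * configured with CPUs 0-3, but a sibling partition currently holds
	 * CPUs 2-3, so cs only effectively holds CPUs 0-1.
	 */
	unsigned long parent_effective_xcpus = 0xffUL; /* CPUs 0-7 */
	unsigned long cs_configured_xcpus    = 0x0fUL; /* CPUs 0-3 */
	unsigned long cs_effective_xcpus     = 0x03UL; /* CPUs 0-1 */

	/*
	 * Old behaviour: intersecting the configured mask with the parent's
	 * effective_xcpus yields 0x0f, handing CPUs 2-3 back to the parent
	 * while the sibling still uses them -> overlapping effective_cpus.
	 */
	printf("old adding: 0x%lx\n", cs_configured_xcpus & parent_effective_xcpus);

	/*
	 * Fixed behaviour: intersecting cs->effective_xcpus yields 0x03,
	 * i.e. only the CPUs this partition actually holds.
	 */
	printf("new adding: 0x%lx\n", cs_effective_xcpus & parent_effective_xcpus);

	return 0;
}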

* tag 'cgroup-for-7.1-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/cpuset: Return only actually allocated CPUs during partition invalidation
  selftests/cgroup: Fix error path leaks in test_percpu_basic
  cgroup/cpuset: Reserve DL bandwidth only for root-domain moves
  cgroup/cpuset: Reset DL migration state on can_attach() failure
  selftests/cgroup: Fix string comparison in write_test
  selftests/cgroup: Fix cg_read_strcmp() empty string comparison
  cgroup/dmem: Return -ENOMEM on failed pool preallocation
  cgroup/cpuset: move PF_EXITING check before __GFP_HARDWALL in cpuset_current_node_allowed()
Linus Torvalds
2026-05-13 14:56:31 -07:00
8 changed files with 65 additions and 36 deletions


@@ -33,6 +33,15 @@ struct root_domain;
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
extern void dl_clear_root_domain_cpu(int cpu);
/*
 * Return whether moving DL task @p to @new_mask requires moving DL
 * bandwidth accounting between root domains.  The helper is shared by
 * cpuset_can_attach() and set_cpus_allowed_dl() so that both paths
 * apply the same source root-domain test.
 */
extern bool dl_task_needs_bw_move(struct task_struct *p,
const struct cpumask *new_mask);
extern u64 dl_cookie;
extern bool dl_bw_visited(int cpu, u64 cookie);


@@ -167,6 +167,7 @@ struct cpuset {
*/
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
/* DL bandwidth that needs destination reservation for this attach. */
u64 sum_migrate_dl_bw;
/*
* CPU used for temporary DL bandwidth allocation during attach;


@@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
cs->effective_xcpus,
parent->effective_xcpus);
if (old_prs > 0)
new_prs = -old_prs;
@@ -2993,7 +2994,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cpuset *cs, *oldcs;
struct task_struct *task;
bool setsched_check;
int ret;
int cpu, ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
@@ -3038,39 +3039,42 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
}
if (dl_task(task)) {
/*
* Count all migrating DL tasks for cpuset task accounting.
* Only tasks that need a root-domain bandwidth move
* contribute to sum_migrate_dl_bw.
*/
cs->nr_migrate_dl_tasks++;
cs->sum_migrate_dl_bw += task->dl.dl_bw;
if (dl_task_needs_bw_move(task, cs->effective_cpus))
cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}
if (!cs->nr_migrate_dl_tasks)
if (!cs->sum_migrate_dl_bw)
goto out_success;
if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
if (unlikely(cpu >= nr_cpu_ids)) {
reset_migrate_dl_data(cs);
ret = -EINVAL;
goto out_unlock;
}
ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
if (ret) {
reset_migrate_dl_data(cs);
goto out_unlock;
}
cs->dl_bw_cpu = cpu;
cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
if (unlikely(cpu >= nr_cpu_ids)) {
ret = -EINVAL;
goto out_unlock;
}
ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
if (ret)
goto out_unlock;
cs->dl_bw_cpu = cpu;
out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
out_unlock:
if (ret)
reset_migrate_dl_data(cs);
mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -4176,11 +4180,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
* yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
* If current is exiting (PF_EXITING), yes. Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
* unless the task has been OOM killed.
* unless the task has been OOM killed or is exiting.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -4194,7 +4198,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
* so no allocation on a node outside the cpuset is allowed (unless
* in interrupt, of course).
* in interrupt, of course). The PF_EXITING check must therefore
* come before the __GFP_HARDWALL check, otherwise a dying task
* would be blocked on the fast path.
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -4204,6 +4210,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* tsk_is_oom_victim - any node ok
* PF_EXITING - any node ok (let dying task exit quickly)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -4223,11 +4230,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
*/
if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
if (current->flags & PF_EXITING) /* Let dying task have memory */
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);


@@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = NULL;
continue;
}
pool = ERR_PTR(-ENOMEM);
}
}
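
For the dmem change, whose hunk shows only the added line, here is a hedged
sketch of the allocate-outside-the-lock retry pattern that the surrounding
context suggests; the helper name lookup_or_install_pool() and the exact
control flow are assumptions for illustration, not the actual
kernel/cgroup/dmem.c code.

	while (!pool) {
		/* Look up or claim the pool under the lock, possibly consuming
		 * a preallocated fallback pool (assumed helper). */
		pool = lookup_or_install_pool(cg, region, &allocpool);

		if (pool == ERR_PTR(-ENOMEM)) {
			/* The lookup needed a preallocated pool we did not have yet. */
			pool = NULL;

			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool)
				continue;	/* retry with the fallback pool in hand */

			/*
			 * The fix: if the fallback allocation also fails, report
			 * -ENOMEM to the caller instead of looping forever with
			 * pool still NULL.
			 */
			pool = ERR_PTR(-ENOMEM);
		}
	}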


@@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
/*
* Migrating a SCHED_DEADLINE task between exclusive
* cpusets (different root_domains) entails a bandwidth
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
if (dl_task_needs_bw_move(p, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p,
set_cpus_allowed_common(p, ctx);
}
bool dl_task_needs_bw_move(struct task_struct *p,
const struct cpumask *new_mask)
{
if (!dl_task(p))
return false;
return !cpumask_intersects(task_rq(p)->rd->span, new_mask);
}
/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{


@@ -106,8 +106,9 @@ int cg_read_strcmp(const char *cgroup, const char *control,
/* Handle the case of comparing against empty string */
if (!expected)
return -1;
else
size = strlen(expected) + 1;
/* needs size > 1, otherwise cg_read() reads 0 bytes */
size = (expected[0] == '\0') ? 2 : strlen(expected) + 1;
buf = malloc(size);
if (!buf)
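
The new comment is the key point; as a hedged standalone illustration (not
selftest code), assuming cg_read() fills at most size - 1 bytes as that
comment states, the old sizing made the empty-string comparison vacuous:

#include <assert.h>
#include <string.h>
#include <unistd.h>

/*
 * With the old code, expected == "" gave size == 1, so the buffer had room
 * for nothing but the terminator and the read was zero-length.
 */
static void old_behaviour_demo(int fd)
{
	char buf[1] = "";
	ssize_t len = read(fd, buf, sizeof(buf) - 1);	/* a 0-byte read */

	assert(len == 0);		/* nothing read from the control file */
	assert(strcmp(buf, "") == 0);	/* so the comparison always "matches" */
}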


@@ -18,7 +18,7 @@ write_test() {
echo "testing $interface $value"
echo $value > $dir/$interface
new=$(cat $dir/$interface)
[[ $value -ne $(cat $dir/$interface) ]] && {
[[ "$value" != "$new" ]] && {
echo "$interface write $value failed: new:$new"
exit 1
}


@@ -368,11 +368,15 @@ static int test_percpu_basic(const char *root)
for (i = 0; i < 1000; i++) {
child = cg_name_indexed(parent, "child", i);
if (!child)
return -1;
if (cg_create(child))
if (!child) {
ret = -1;
goto cleanup_children;
}
if (cg_create(child)) {
free(child);
goto cleanup_children;
}
free(child);
}