diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea30..f7aaf01f7cd5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. */ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e3a081a07c6d..5c33ab20cc20 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; @@ -2993,7 +2994,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,39 +3039,42 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - reset_migrate_dl_data(cs); - ret = -EINVAL; - goto out_unlock; - } - - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) { - reset_migrate_dl_data(cs); - goto out_unlock; - } - - cs->dl_bw_cpu = cpu; + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; } + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; + + cs->dl_bw_cpu = cpu; + out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; + out_unlock: + if (ret) + reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); return ret; } @@ -4176,11 +4180,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. - * Otherwise, no. + * If the current task is PF_EXITING, yes. Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed. + * unless the task has been OOM killed or is exiting. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -4194,7 +4198,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). + * in interrupt, of course). The PF_EXITING check must therefore + * come before the __GFP_HARDWALL check, otherwise a dying task + * would be blocked on the fast path. * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -4204,6 +4210,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok + * PF_EXITING - any node ok (let dying task exit quickly) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -4223,11 +4230,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) */ if (unlikely(tsk_is_oom_victim(current))) return true; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return false; - if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return false; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f271..4753a67d0f0f 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = NULL; continue; } + pool = ERR_PTR(-ENOMEM); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..7db4c87df83b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 6a7295347e90..42f54936f4bb 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -106,8 +106,9 @@ int cg_read_strcmp(const char *cgroup, const char *control, /* Handle the case of comparing against empty string */ if (!expected) return -1; - else - size = strlen(expected) + 1; + + /* needs size > 1, otherwise cg_read() reads 0 bytes */ + size = (expected[0] == '\0') ? 2 : strlen(expected) + 1; buf = malloc(size); if (!buf) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh index 42a6628fb8bc..1c0444729e70 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -18,7 +18,7 @@ write_test() { echo "testing $interface $value" echo $value > $dir/$interface new=$(cat $dir/$interface) - [[ $value -ne $(cat $dir/$interface) ]] && { + [[ "$value" != "$new" ]] && { echo "$interface write $value failed: new:$new" exit 1 } diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index eeabd34bf083..12f59925500b 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -368,11 +368,15 @@ static int test_percpu_basic(const char *root) for (i = 0; i < 1000; i++) { child = cg_name_indexed(parent, "child", i); - if (!child) - return -1; - - if (cg_create(child)) + if (!child) { + ret = -1; goto cleanup_children; + } + + if (cg_create(child)) { + free(child); + goto cleanup_children; + } free(child); }