From 8fedac321fb0fb368d4c14674e2a64852b4f225e Mon Sep 17 00:00:00 2001
From: Joshua Hahn
Date: Tue, 7 Apr 2026 07:14:14 -0700
Subject: [PATCH 1/7] mm/mempolicy: fix weighted interleave auto sysfs name

The __ATTR macro is a utility that makes defining kobj_attributes easier
by stringifying the name, verifying the mode, and setting the show/store
fields in a single initializer. It takes a raw token as the first value,
rather than a string, so that __ATTR family macros like __ATTR_RW can
token-paste it to infer the _show / _store function names.

Commit e341f9c3c841 ("mm/mempolicy: Weighted Interleave Auto-tuning")
used the __ATTR macro to define the "auto" sysfs file for weighted
interleave. A few months later, commit 2fb6915fa22d ("compiler_types.h:
add "auto" as a macro for "__auto_type"") introduced a #define that
expands auto into __auto_type. This caused the "auto" token passed into
__ATTR to be expanded into __auto_type, so the sysfs entry was displayed
as __auto_type as well.

Expand out the __ATTR macro and directly pass the string "auto" instead
of the raw token 'auto', so the name can no longer be macro-expanded.
This also bypasses the VERIFY_OCTAL_PERMISSIONS check, so triple-check
that 0664 is indeed the intended permission for this sysfs file.

Before:
$ ls /sys/kernel/mm/mempolicy/weighted_interleave
__auto_type  node0

After:
$ ls /sys/kernel/mm/mempolicy/weighted_interleave/
auto  node0

Link: https://lore.kernel.org/20260407141415.3080960-1-joshua.hahnjy@gmail.com
Fixes: 2fb6915fa22d ("compiler_types.h: add "auto" as a macro for "__auto_type"")
Signed-off-by: Joshua Hahn
Reviewed-by: Gregory Price
Reviewed-by: Rakie Kim
Acked-by: David Hildenbrand (Arm)
Acked-by: Zi Yan
Cc: Alistair Popple
Cc: Byungchul Park
Cc: "Huang, Ying"
Cc: Matthew Brost
Cc: Rakie Kim
Cc: Ying Huang
Signed-off-by: Andrew Morton
---
 mm/mempolicy.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e5175f1c767..5c03bdbb7215 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3781,9 +3781,11 @@ static void wi_state_free(void)
 	}
 }
 
-static struct kobj_attribute wi_auto_attr =
-	__ATTR(auto, 0664, weighted_interleave_auto_show,
-	       weighted_interleave_auto_store);
+static struct kobj_attribute wi_auto_attr = {
+	.attr = { .name = "auto", .mode = 0664 },
+	.show = weighted_interleave_auto_show,
+	.store = weighted_interleave_auto_store,
+};
 
 static void wi_cleanup(void) {
 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);

From 8bbde987c2b84f80da0853f739f0a920386f8b99 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 6 Apr 2026 17:31:52 -0700
Subject: [PATCH 2/7] mm/damon/core: disallow time-quota setting zero esz

When the throughput of a DAMOS scheme is very slow, the DAMOS time quota
can make the effective size quota smaller than damon_ctx->min_region_sz.
In that case, damos_apply_scheme() will skip applying the action,
because actions are applied at region granularity, which requires at
least min_region_sz bytes. That is, the quota is effectively exceeded
for the whole quota charge window.

Because no action is applied, total_charged_sz and total_charged_ns are
not updated either. damos_set_effective_quota() will try to update the
effective size quota before starting the next charge window. However,
because total_charged_sz and total_charged_ns have not been updated, the
throughput estimate and hence the effective size quota do not change.
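For example, with illustrative numbers plugged into the
damos_set_effective_quota() logic (assuming min_region_sz is 4096):

	throughput = 200;		/* measured bytes per ms */
	esz = throughput * quota->ms;	/* 1000 bytes for a 5 ms time quota */

Because 1000 < 4096, no region can be applied, nothing is charged, and
the next window derives the same 1000-byte esz from the same frozen
totals.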
Since the effective size quota can only be decreased, the other update
factors, including DAMOS quota goals and the size quota, cannot change
it either. As a result, the scheme is unexpectedly deactivated until the
user notices and mitigates the situation.

Users can mitigate the situation by changing the time quota online, or
by re-installing the scheme. While the mitigation is somewhat
straightforward, noticing the situation in the first place is
challenging, because DAMON does not provide good observability for it.
Even if such observability were provided, the additional monitoring and
mitigation work would be cumbersome and contrary to the intention of the
time quota, which was meant to reduce the user's administration
overhead.

Fix the problem by making the time-quota-derived effective size quota
always at least min_region_sz.

The issue was discovered [1] by sashiko.

Link: https://lore.kernel.org/20260407003153.79589-1-sj@kernel.org
Link: https://lore.kernel.org/20260405192504.110014-1-sj@kernel.org [1]
Fixes: 1cd243030059 ("mm/damon/schemes: implement time quota")
Signed-off-by: SeongJae Park
Cc: <stable@vger.kernel.org> # 5.16.x
Signed-off-by: Andrew Morton
---
 mm/damon/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3e1890d64d06..3703f62a876b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2228,7 +2228,8 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
 /*
  * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
  */
-static void damos_set_effective_quota(struct damos_quota *quota)
+static void damos_set_effective_quota(struct damos_quota *quota,
+		struct damon_ctx *ctx)
 {
 	unsigned long throughput;
 	unsigned long esz = ULONG_MAX;
@@ -2254,6 +2255,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
 		else
 			throughput = PAGE_SIZE * 1024;
 		esz = min(throughput * quota->ms, esz);
+		esz = max(ctx->min_region_sz, esz);
 	}
 
 	if (quota->sz && quota->sz < esz)
@@ -2290,7 +2292,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 	/* First charge window */
 	if (!quota->total_charged_sz && !quota->charged_from) {
 		quota->charged_from = jiffies;
-		damos_set_effective_quota(quota);
+		damos_set_effective_quota(quota, c);
 	}
 
 	/* New charge window starts */
@@ -2303,7 +2305,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 		quota->charged_sz = 0;
 		if (trace_damos_esz_enabled())
 			cached_esz = quota->esz;
-		damos_set_effective_quota(quota);
+		damos_set_effective_quota(quota, c);
 		if (trace_damos_esz_enabled() && quota->esz != cached_esz)
 			damos_trace_esz(c, s, quota);
 	}

From 39928984956037cabd304321cb8f342e47421db5 Mon Sep 17 00:00:00 2001
From: Matthew Brost
Date: Fri, 10 Apr 2026 16:03:46 -0700
Subject: [PATCH 3/7] mm/zone_device: do not touch device folio after calling
 ->folio_free()

The contents of a device folio can change immediately after
->folio_free() is called, because the driver may reallocate the folio
with a different order. Instead of touching the folio again to extract
the pgmap, use the local stack variable when calling
percpu_ref_put_many().
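Reduced to the relevant lines, the safe pattern is (a sketch using the
local variable already present in free_zone_device_folio()):

	struct dev_pagemap *pgmap = folio->pgmap; /* read while the folio is still ours */

	pgmap->ops->folio_free(folio);        /* the driver may reallocate the folio here */
	percpu_ref_put_many(&pgmap->ref, nr); /* use the cached pointer, not folio->pgmap */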
Link: https://lore.kernel.org/20260410230346.4009855-1-matthew.brost@intel.com
Fixes: d245f9b4ab80 ("mm/zone_device: support large zone device private folios")
Signed-off-by: Matthew Brost
Reviewed-by: Balbir Singh
Reviewed-by: Vishal Moola
Reviewed-by: Alistair Popple
Cc: David Hildenbrand
Cc: Oscar Salvador
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
---
 mm/memremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memremap.c b/mm/memremap.c
index ac7be07e3361..053842d45cb1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -454,7 +454,7 @@ void free_zone_device_folio(struct folio *folio)
 		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
 			break;
 		pgmap->ops->folio_free(folio);
-		percpu_ref_put_many(&folio->pgmap->ref, nr);
+		percpu_ref_put_many(&pgmap->ref, nr);
 		break;
 
 	case MEMORY_DEVICE_GENERIC:

From 8f5857be99f1ed1fa80991c72449541f634626ee Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Mon, 13 Apr 2026 03:09:19 -0700
Subject: [PATCH 4/7] mm: blk-cgroup: fix use-after-free in cgwb_release_workfn()

cgwb_release_workfn() calls css_put(wb->blkcg_css) and then later
accesses wb->blkcg_css again via blkcg_unpin_online(). If css_put()
drops the last reference, the blkcg can be freed asynchronously
(css_free_rwork_fn -> blkcg_css_free -> kfree) before
blkcg_unpin_online() dereferences the pointer to access
blkcg->online_pin, resulting in a use-after-free:

BUG: KASAN: slab-use-after-free in blkcg_unpin_online (./include/linux/instrumented.h:112 ./include/linux/atomic/atomic-instrumented.h:400 ./include/linux/refcount.h:389 ./include/linux/refcount.h:432 ./include/linux/refcount.h:450 block/blk-cgroup.c:1367)
Write of size 4 at addr ff11000117aa6160 by task kworker/71:1/531
Workqueue: cgwb_release cgwb_release_workfn
Call Trace:
 blkcg_unpin_online (./include/linux/instrumented.h:112 ./include/linux/atomic/atomic-instrumented.h:400 ./include/linux/refcount.h:389 ./include/linux/refcount.h:432 ./include/linux/refcount.h:450 block/blk-cgroup.c:1367)
 cgwb_release_workfn (mm/backing-dev.c:629)
 process_scheduled_works (kernel/workqueue.c:3278 kernel/workqueue.c:3385)

Freed by task 1016:
 kfree (./include/linux/kasan.h:235 mm/slub.c:2689 mm/slub.c:6246 mm/slub.c:6561)
 css_free_rwork_fn (kernel/cgroup/cgroup.c:5542)
 process_scheduled_works (kernel/workqueue.c:3302 kernel/workqueue.c:3385)

** Stack based on commit 66672af7a095 ("Add linux-next specific files for 20260410")

I am seeing this crash sporadically in the Meta fleet, across multiple
kernel versions. A full reproducer is available at:
https://github.com/leitao/debug/blob/main/reproducers/repro_blkcg_uaf.sh

(The race window is narrow. To make it easily reproducible, inject a
msleep(100) between css_put() and blkcg_unpin_online() in
cgwb_release_workfn(). With that delay and a KASAN-enabled kernel, the
reproducer triggers the splat reliably in less than a second.)

Fix this by moving blkcg_unpin_online() before css_put(), so the cgwb's
CSS reference keeps the blkcg alive while blkcg_unpin_online() accesses
it.
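Reduced to the essentials, the ordering problem and its fix look like
this (an illustrative sketch, not verbatim kernel code):

	/* Before the fix: */
	css_put(wb->blkcg_css);             /* may drop the last css reference */
	/* ... css_free_rwork_fn() can free the blkcg here ... */
	blkcg_unpin_online(wb->blkcg_css);  /* writes blkcg->online_pin: UAF */

	/* After the fix: */
	blkcg_unpin_online(wb->blkcg_css);  /* blkcg kept alive by our reference */
	css_put(wb->blkcg_css);             /* only now may the last reference drop */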
Link: https://lore.kernel.org/20260413-blkcg-v1-1-35b72622d16c@debian.org
Fixes: 59b57717fff8 ("blkcg: delay blkg destruction until after writeback has finished")
Signed-off-by: Breno Leitao
Reviewed-by: Dennis Zhou
Reviewed-by: Shakeel Butt
Cc: David Hildenbrand
Cc: Jens Axboe
Cc: Johannes Weiner
Cc: Josef Bacik
Cc: JP Kobryn
Cc: Liam Howlett
Cc: Lorenzo Stoakes (Oracle)
Cc: Martin KaFai Lau
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Suren Baghdasaryan
Cc: Tejun Heo
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
---
 mm/backing-dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7a18fa6c7272..cecbcf9060a6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -618,12 +618,13 @@ static void cgwb_release_workfn(struct work_struct *work)
 	wb_shutdown(wb);
 
 	css_put(wb->memcg_css);
-	css_put(wb->blkcg_css);
-	mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
 	/* triggers blkg destruction if no online users left */
 	blkcg_unpin_online(wb->blkcg_css);
 
+	css_put(wb->blkcg_css);
+	mutex_unlock(&wb->bdi->cgwb_release_mutex);
+
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 
 	spin_lock_irq(&cgwb_lock);

From 615d9bb2ccad42f9e21d837431e401db2e471195 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 13 Apr 2026 19:43:11 +0100
Subject: [PATCH 5/7] mm: call ->free_folio() directly in folio_unmap_invalidate()

We can only call filemap_free_folio() if we have a reference to (or hold
a lock on) the mapping. Otherwise, since we have already removed the
folio from the mapping, the folio no longer pins the mapping; the
mapping can then be freed, causing a use-after-free when accessing
mapping->a_ops.

Follow the same pattern as __remove_mapping() and load the free_folio
function pointer before dropping the lock on the mapping. That lets us
make filemap_free_folio() static, as this was the only caller outside
filemap.c.
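The resulting shape of the fixed path is (a condensed sketch; see the
mm/truncate.c hunk below for the full change, including the folio
reference drop):

	free_folio = mapping->a_ops->free_folio; /* mapping still stable: lock held */
	spin_unlock(&mapping->host->i_lock);     /* mapping may go away from here on */
	if (free_folio)
		free_folio(folio);               /* no mapping dereference needed */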
Link: https://lore.kernel.org/20260413184314.3419945-1-willy@infradead.org
Fixes: fb7d3bc41493 ("mm/filemap: drop streaming/uncached pages when writeback completes")
Signed-off-by: Matthew Wilcox (Oracle)
Reported-by: Google Big Sleep
Cc: Jens Axboe
Cc: Jan Kara
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
---
 mm/filemap.c  | 3 ++-
 mm/internal.h | 1 -
 mm/truncate.c | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 3c1e785542dd..793bf4816ea3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -228,7 +228,8 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
 	page_cache_delete(mapping, folio, shadow);
 }
 
-void filemap_free_folio(struct address_space *mapping, struct folio *folio)
+static void filemap_free_folio(const struct address_space *mapping,
+		struct folio *folio)
 {
 	void (*free_folio)(struct folio *);
 
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..546114d3ee44 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -540,7 +540,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
-void filemap_free_folio(struct address_space *mapping, struct folio *folio);
 int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
 bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
 		loff_t end);
diff --git a/mm/truncate.c b/mm/truncate.c
index 12467c1bd711..8617a12cb169 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,6 +622,7 @@ static int folio_launder(struct address_space *mapping, struct folio *folio)
 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 		gfp_t gfp)
 {
+	void (*free_folio)(struct folio *);
 	int ret;
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -648,9 +649,12 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
 		inode_lru_list_add(mapping->host);
+	free_folio = mapping->a_ops->free_folio;
 	spin_unlock(&mapping->host->i_lock);
 
-	filemap_free_folio(mapping, folio);
+	if (free_folio)
+		free_folio(folio);
+	folio_put_refs(folio, folio_nr_pages(folio));
 	return 1;
 failed:
 	xa_unlock_irq(&mapping->i_pages);

From ec05f51f1e65bce95528543eb73fda56fd201d94 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Mon, 13 Apr 2026 21:26:46 +0200
Subject: [PATCH 6/7] mm/vmalloc: take vmap_purge_lock in shrinker

decay_va_pool_node() can be invoked concurrently from two paths:
__purge_vmap_area_lazy() when pools are being purged, and the shrinker
via vmap_node_shrink_scan(). However, decay_va_pool_node() is not safe
to run concurrently, and the shrinker path currently lacks
serialization, leading to races and possible leaks.

Protect decay_va_pool_node() by taking vmap_purge_lock in the shrinker
path, serializing it with the purge users.
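Note that guard(mutex)(...) is the scope-based lock helper from
<linux/cleanup.h>: it acquires the mutex and releases it automatically
when the function returns. For the region shown below it is equivalent
to the open-coded form:

	mutex_lock(&vmap_purge_lock);
	for_each_vmap_node(vn)
		decay_va_pool_node(vn, true);
	mutex_unlock(&vmap_purge_lock);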
Link: https://lore.kernel.org/20260413192646.14683-1-urezki@gmail.com
Fixes: 7679ba6b36db ("mm: vmalloc: add a shrinker to drain vmap pools")
Signed-off-by: Uladzislau Rezki (Sony)
Reviewed-by: Baoquan He
Cc: chenyichong
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 61caa55a4402..676851d5cfe7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5416,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct vmap_node *vn;
 
+	guard(mutex)(&vmap_purge_lock);
 	for_each_vmap_node(vn)
 		decay_va_pool_node(vn, true);
 

From 95093e5cb4c5b50a5b1a4b79f2942b62744bd66a Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Sat, 11 Apr 2026 14:36:36 -0700
Subject: [PATCH 7/7] mm/damon/core: disallow non-power of two min_region_sz
 on damon_start()

Commit d8f867fa0825 ("mm/damon: add damon_ctx->min_sz_region")
introduced a bug that allows unaligned DAMON region address ranges.
Commit c80f46ac228b ("mm/damon/core: disallow non-power of two
min_region_sz") fixed it, but only for the damon_commit_ctx() use case.
The DAMON sysfs interface can still pass a non-power-of-two
min_region_sz via damon_start(). Fix that path by adding the
is_power_of_2() check to damon_start().

The issue was discovered by sashiko [1].

Link: https://lore.kernel.org/20260411213638.77768-1-sj@kernel.org
Link: https://lore.kernel.org/20260403155530.64647-1-sj@kernel.org [1]
Fixes: d8f867fa0825 ("mm/damon: add damon_ctx->min_sz_region")
Signed-off-by: SeongJae Park
Cc: <stable@vger.kernel.org> # 6.18.x
Signed-off-by: Andrew Morton
---
 mm/damon/core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3703f62a876b..c107d74c77e7 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1368,6 +1368,11 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive)
 	int i;
 	int err = 0;
 
+	for (i = 0; i < nr_ctxs; i++) {
+		if (!is_power_of_2(ctxs[i]->min_region_sz))
+			return -EINVAL;
+	}
+
 	mutex_lock(&damon_lock);
 	if ((exclusive && nr_running_ctxs) ||
 			(!exclusive && running_exclusive_ctxs)) {
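As background on why a power of two is required (illustrative, assuming
region boundaries are aligned with the kernel's mask-based ALIGN_DOWN()
helper from include/linux/align.h): mask-based alignment only yields a
multiple of the alignment when the alignment is a power of two:

	#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))

	/* e.g. ALIGN_DOWN(10000, 3000) evaluates to 9216, not 9000 */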