From d3722ff57eadbb49ce5d08504dad19ac8d8cee69 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 24 Oct 2025 19:06:52 +0200 Subject: [PATCH 01/42] slab: constify slab debug strings Since the string passed to slab_debug is never modified, use pointers to const char in all places where it is processed. No functional changes intended. Signed-off-by: Petr Tesarik Reviewed-by: Christoph Lameter Link: https://patch.msgid.link/819095b921f6ae03bb54fd69ee4020e2a3aef675.1761324765.git.ptesarik@suse.com Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d4367f25b20d..de74c0e9985e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -978,7 +978,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; static slab_flags_t slub_debug; #endif -static char *slub_debug_string; +static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* @@ -1785,8 +1785,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, * * returns the start of next block if there's any, or NULL */ -static char * -parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) +static const char * +parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) { bool higher_order_disable = false; @@ -1863,14 +1863,15 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) return NULL; } -static int __init setup_slub_debug(char *str) +static int __init setup_slub_debug(char *val) { slab_flags_t flags; slab_flags_t global_flags; - char *saved_str; - char *slab_list; + const char *saved_str; + const char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; + const char *str = val; global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) @@ -1935,9 +1936,9 @@ __setup_param("slub_debug", slub_debug, setup_slub_debug, 0); */ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { - char *iter; + const char *iter; size_t len; - char *next_block; + const char *next_block; slab_flags_t block_flags; slab_flags_t slub_debug_local = slub_debug; @@ -1961,7 +1962,7 @@ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) continue; /* Found a block that has a slab list, search it */ while (*iter) { - char *end, *glob; + const char *end, *glob; size_t cmplen; end = strchrnul(iter, ','); From aed760df8e8ebc2035561e53bef184e6a8240610 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 24 Oct 2025 19:06:53 +0200 Subject: [PATCH 02/42] slab: convert setup_slub_debug() to use __core_param_cb() Use __core_param_cb() to parse the "slab_debug" kernel parameter instead of the obsolescent __setup(). For now, the parameter is not exposed in sysfs, and no get ops is provided. There is a slight change in behavior. Before this patch, the following parameter would silently turn on full debugging for all slabs: slub_debug_yada_yada_gotta_love_this=hail_satan! This syntax is now rejected, and the parameter will be passed to user space, making the kernel a holier place. 
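To make the behavior change concrete, here is a small standalone illustration (plain userspace C, not kernel code; the function names and the simplified matching are assumptions for demonstration, the real matching lives in init/main.c and kernel/params.c) of why the old __setup()-style prefix match accepted the bogus parameter while exact-name matching rejects it:

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  /* old scheme (roughly): a handler matches any parameter that merely
   * starts with the registered name */
  static bool prefix_match(const char *line, const char *name)
  {
  	return strncmp(line, name, strlen(name)) == 0;
  }

  /* new scheme (roughly): the name must match exactly up to '=' or end */
  static bool exact_match(const char *line, const char *name)
  {
  	size_t n = strcspn(line, "=");

  	return n == strlen(name) && strncmp(line, name, n) == 0;
  }

  int main(void)
  {
  	const char *bogus = "slub_debug_yada_yada=hail_satan!";

  	printf("prefix: %d exact: %d\n",
  	       prefix_match(bogus, "slub_debug"),
  	       exact_match(bogus, "slub_debug"));
  	return 0;
  }

With the old prefix match, the handler then saw a string that did not start with '=', which the old setup_slub_debug() treated as "no options" and switched on full debugging.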
Signed-off-by: Petr Tesarik Link: https://patch.msgid.link/9674b34861394088c7853edf8e9d2b439fd4b42f.1761324765.git.ptesarik@suse.com Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index de74c0e9985e..e725b8d7199d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1863,7 +1863,7 @@ parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, return NULL; } -static int __init setup_slub_debug(char *val) +static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) { slab_flags_t flags; slab_flags_t global_flags; @@ -1871,10 +1871,9 @@ static int __init setup_slub_debug(char *val) const char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - const char *str = val; global_flags = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) + if (!str || !*str) /* * No options specified. Switch on full debugging. */ @@ -1918,11 +1917,15 @@ static int __init setup_slub_debug(char *val) static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); - return 1; + return 0; } -__setup("slab_debug", setup_slub_debug); -__setup_param("slub_debug", slub_debug, setup_slub_debug, 0); +static const struct kernel_param_ops param_ops_slab_debug __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slub_debug, +}; +__core_param_cb(slab_debug, ¶m_ops_slab_debug, NULL, 0); +__core_param_cb(slub_debug, ¶m_ops_slab_debug, NULL, 0); /* * kmem_cache_flags - apply debugging options to the cache From 8ad018dbd344c0cdc5f31c4fab56593f85eede02 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 24 Oct 2025 19:06:54 +0200 Subject: [PATCH 03/42] slab: use new API for remaining command line parameters Use core_param() and __core_param_cb() instead of __setup() or __setup_param() to improve syntax checking and error messages. Replace get_option() with kstrtouint(), because: * the latter accepts a pointer to const char, * these parameters should not accept ranges, * error value can be passed directly to parser. There is one more change apart from the parsing of numeric parameters: slab_strict_numa parameter name must match exactly. Before this patch the kernel would silently accept any option that starts with the name as an undocumented alias. 
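As a rough userspace sketch of the parsing difference (this only mimics the spirit of kstrtouint()'s strict rejection of trailing characters; the real kernel helper differs in detail, e.g. it also tolerates a trailing newline):

  #include <errno.h>
  #include <limits.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* strict parse: any trailing character, such as a "-8" range suffix,
   * is an error that can be returned straight to the parameter parser */
  static int strict_parse_uint(const char *s, unsigned int *out)
  {
  	unsigned long val;
  	char *end;

  	errno = 0;
  	val = strtoul(s, &end, 0);
  	if (errno || end == s || *end != '\0' || val > UINT_MAX)
  		return -EINVAL;

  	*out = (unsigned int)val;
  	return 0;
  }

  int main(void)
  {
  	unsigned int v = 0;

  	printf("\"3\"   -> %d\n", strict_parse_uint("3", &v));   /* 0 */
  	printf("\"4-8\" -> %d\n", strict_parse_uint("4-8", &v)); /* -EINVAL */
  	return 0;
  }
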
Signed-off-by: Petr Tesarik Link: https://patch.msgid.link/6ae7e0ddc72b7619203c07dd5103a598e12f713b.1761324765.git.ptesarik@suse.com Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 57 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e725b8d7199d..3095f10e0fe4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -8131,46 +8131,53 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) * Kmalloc subsystem *******************************************************************/ -static int __init setup_slub_min_order(char *str) +static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_min_order); + int ret; + + ret = kstrtouint(str, 0, &slub_min_order); + if (ret) + return ret; if (slub_min_order > slub_max_order) slub_max_order = slub_min_order; - return 1; + return 0; } -__setup("slab_min_order=", setup_slub_min_order); -__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); +static const struct kernel_param_ops param_ops_slab_min_order __initconst = { + .set = setup_slub_min_order, +}; +__core_param_cb(slab_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); +__core_param_cb(slub_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); - -static int __init setup_slub_max_order(char *str) +static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_max_order); + int ret; + + ret = kstrtouint(str, 0, &slub_max_order); + if (ret) + return ret; + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); if (slub_min_order > slub_max_order) slub_min_order = slub_max_order; - return 1; + return 0; } -__setup("slab_max_order=", setup_slub_max_order); -__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); +static const struct kernel_param_ops param_ops_slab_max_order __initconst = { + .set = setup_slub_max_order, +}; +__core_param_cb(slab_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); +__core_param_cb(slub_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); -static int __init setup_slub_min_objects(char *str) -{ - get_option(&str, (int *)&slub_min_objects); - - return 1; -} - -__setup("slab_min_objects=", setup_slub_min_objects); -__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +core_param(slab_min_objects, slub_min_objects, uint, 0); +core_param(slub_min_objects, slub_min_objects, uint, 0); #ifdef CONFIG_NUMA -static int __init setup_slab_strict_numa(char *str) +static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) { if (nr_node_ids > 1) { static_branch_enable(&strict_numa); @@ -8179,10 +8186,14 @@ static int __init setup_slab_strict_numa(char *str) pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); } - return 1; + return 0; } -__setup("slab_strict_numa", setup_slab_strict_numa); +static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slab_strict_numa, +}; +__core_param_cb(slab_strict_numa, ¶m_ops_slab_strict_numa, NULL, 0); #endif From ea6b5e5778b1dc58b1909e4badd3e180ddae7418 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:30 +0100 Subject: [PATCH 04/42] slab: move kfence_alloc() out of internal bulk alloc SLUB's internal bulk allocation __kmem_cache_alloc_bulk() can currently allocate some 
objects from KFENCE, i.e. when refilling a sheaf. It works but it's conceptually the wrong layer, as KFENCE allocations should only happen when objects are actually handed out from slab to its users. Currently for sheaf-enabled caches, slab_alloc_node() can return KFENCE object via kfence_alloc(), but also via alloc_from_pcs() when a sheaf was refilled with KFENCE objects. Continuing like this would also complicate the upcoming sheaf refill changes. Thus remove KFENCE allocation from __kmem_cache_alloc_bulk() and move it to the places that return slab objects to users. slab_alloc_node() is already covered (see above). Add kfence_alloc() to kmem_cache_alloc_from_sheaf() to handle KFENCE allocations from prefilled sheafs, with a comment that the caller should not expect the sheaf size to decrease after every allocation because of this possibility. For kmem_cache_alloc_bulk() implement a different strategy to handle KFENCE upfront and rely on internal batched operations afterwards. Assume there will be at most once KFENCE allocation per bulk allocation and then assign its index in the array of objects randomly. Cc: Alexander Potapenko Cc: Marco Elver Cc: Dmitry Vyukov Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-2-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 074abe8e79f8..0237a329d4e5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5540,6 +5540,9 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, * * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT * memcg charging is forced over limit if necessary, to avoid failure. + * + * It is possible that the allocation comes from kfence and then the sheaf + * size is not decreased. 
*/ void * kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, @@ -5551,7 +5554,10 @@ kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, if (sheaf->size == 0) goto out; - ret = sheaf->objects[--sheaf->size]; + ret = kfence_alloc(s, s->object_size, gfp); + + if (likely(!ret)) + ret = sheaf->objects[--sheaf->size]; init = slab_want_init_on_alloc(gfp, s); @@ -7399,14 +7405,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_lock_irqsave(&s->cpu_slab->lock, irqflags); for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); + void *object = c->freelist; - if (unlikely(object)) { - p[i] = object; - continue; - } - - object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -7487,6 +7487,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { unsigned int i = 0; + void *kfence_obj; if (!size) return 0; @@ -7495,6 +7496,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; + /* + * to make things simpler, only assume at most once kfence allocated + * object per bulk allocation and choose its index randomly + */ + kfence_obj = kfence_alloc(s, s->object_size, flags); + + if (unlikely(kfence_obj)) { + if (unlikely(size == 1)) { + p[0] = kfence_obj; + goto out; + } + size--; + } + if (s->cpu_sheaves) i = alloc_from_pcs_bulk(s, size, p); @@ -7506,10 +7521,23 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { if (i > 0) __kmem_cache_free_bulk(s, i, p); + if (kfence_obj) + __kfence_free(kfence_obj); return 0; } } + if (unlikely(kfence_obj)) { + int idx = get_random_u32_below(size + 1); + + if (idx != size) + p[size] = p[idx]; + p[idx] = kfence_obj; + + size++; + } + +out: /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. From 1ce20c28eafdc101164a4bfedd2ea818eb137de7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:31 +0100 Subject: [PATCH 05/42] slab: handle pfmemalloc slabs properly with sheaves When a pfmemalloc allocation actually dips into reserves, the slab is marked accordingly and non-pfmemalloc allocations should not be allowed to allocate from it. The sheaves percpu caching currently doesn't follow this rule, so implement it before we expand sheaves usage to all caches. Make sure objects from pfmemalloc slabs don't end up in percpu sheaves. When freeing, skip sheaves when freeing an object from pfmemalloc slab. When refilling sheaves, use __GFP_NOMEMALLOC to override any pfmemalloc context - the allocation will fallback to regular slab allocations when sheaves are depleted and can't be refilled because of the override. For kfree_rcu(), detect pfmemalloc slabs after processing the rcu_sheaf after the grace period in __rcu_free_sheaf_prepare() and simply flush it if any object is from pfmemalloc slabs. For prefilled sheaves, try to refill them first with __GFP_NOMEMALLOC and if it fails, retry without __GFP_NOMEMALLOC but then mark the sheaf pfmemalloc, which makes it flushed back to slabs when returned. 
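The shape of that retry is roughly the following (standalone userspace mock; the helper names, the flag value and the omission of the gfp_pfmemalloc_allowed() check are simplifications made for illustration, the real logic is __prefill_sheaf_pfmemalloc() in the diff below):

  #include <stdbool.h>
  #include <stdio.h>

  #define MOCK_GFP_NOMEMALLOC 0x1u	/* mock flag: never touch reserves */

  struct mock_sheaf {
  	bool pfmemalloc;		/* flush back to slabs when returned */
  };

  /* mock refill: pretend reserves would be needed, so any attempt that
   * forbids them fails */
  static int mock_refill(struct mock_sheaf *sheaf, unsigned int gfp)
  {
  	(void)sheaf;
  	return (gfp & MOCK_GFP_NOMEMALLOC) ? -1 : 0;
  }

  static int mock_prefill(struct mock_sheaf *sheaf, unsigned int gfp)
  {
  	/* common case: refill without dipping into reserves */
  	if (!mock_refill(sheaf, gfp | MOCK_GFP_NOMEMALLOC))
  		return 0;

  	/* fall back to reserves, but remember it for the return path */
  	if (mock_refill(sheaf, gfp))
  		return -1;
  	sheaf->pfmemalloc = true;
  	return 0;
  }

  int main(void)
  {
  	struct mock_sheaf sheaf = { .pfmemalloc = false };

  	mock_prefill(&sheaf, 0);
  	printf("sheaf marked pfmemalloc: %d\n", sheaf.pfmemalloc);
  	return 0;
  }
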
Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-3-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 69 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0237a329d4e5..bb744e8044f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -469,7 +469,10 @@ struct slab_sheaf { struct rcu_head rcu_head; struct list_head barn_list; /* only used for prefilled sheafs */ - unsigned int capacity; + struct { + unsigned int capacity; + bool pfmemalloc; + }; }; struct kmem_cache *cache; unsigned int size; @@ -2651,7 +2654,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) if (!sheaf) return NULL; - if (refill_sheaf(s, sheaf, gfp)) { + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { free_empty_sheaf(s, sheaf); return NULL; } @@ -2729,12 +2732,13 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) sheaf->size = 0; } -static void __rcu_free_sheaf_prepare(struct kmem_cache *s, +static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, struct slab_sheaf *sheaf) { bool init = slab_want_init_on_free(s); void **p = &sheaf->objects[0]; unsigned int i = 0; + bool pfmemalloc = false; while (i < sheaf->size) { struct slab *slab = virt_to_slab(p[i]); @@ -2747,8 +2751,13 @@ static void __rcu_free_sheaf_prepare(struct kmem_cache *s, continue; } + if (slab_test_pfmemalloc(slab)) + pfmemalloc = true; + i++; } + + return pfmemalloc; } static void rcu_free_sheaf_nobarn(struct rcu_head *head) @@ -5041,7 +5050,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; if (empty) { - if (!refill_sheaf(s, empty, gfp)) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { full = empty; } else { /* @@ -5341,6 +5350,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, + struct slab_sheaf *sheaf, gfp_t gfp) +{ + int ret = 0; + + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); + + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) + return ret; + + /* + * if we are allowed to, refill sheaf with pfmemalloc but then remember + * it for when it's returned + */ + ret = refill_sheaf(s, sheaf, gfp); + sheaf->pfmemalloc = true; + + return ret; +} + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5375,6 +5404,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) sheaf->cache = s; sheaf->capacity = size; + /* + * we do not need to care about pfmemalloc here because oversize + * sheaves area always flushed and freed when returned + */ if (!__kmem_cache_alloc_bulk(s, gfp, size, &sheaf->objects[0])) { kfree(sheaf); @@ -5411,17 +5444,18 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) if (!sheaf) sheaf = alloc_empty_sheaf(s, gfp); - if (sheaf && sheaf->size < size) { - if (refill_sheaf(s, sheaf, gfp)) { + if (sheaf) { + sheaf->capacity = s->sheaf_capacity; + sheaf->pfmemalloc = false; + + if (sheaf->size < size && + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { sheaf_flush_unused(s, sheaf); free_empty_sheaf(s, sheaf); sheaf = NULL; } } - if (sheaf) - sheaf->capacity = s->sheaf_capacity; - return sheaf; } @@ -5441,7 +5475,8 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, struct slub_percpu_sheaves *pcs; struct 
node_barn *barn; - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + if (unlikely((sheaf->capacity != s->sheaf_capacity) + || sheaf->pfmemalloc)) { sheaf_flush_unused(s, sheaf); kfree(sheaf); return; @@ -5507,7 +5542,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, if (likely(sheaf->capacity >= size)) { if (likely(sheaf->capacity == s->sheaf_capacity)) - return refill_sheaf(s, sheaf, gfp); + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, &sheaf->objects[sheaf->size])) { @@ -6215,8 +6250,12 @@ static void rcu_free_sheaf(struct rcu_head *head) * handles it fine. The only downside is that sheaf will serve fewer * allocations when reused. It only happens due to debugging, which is a * performance hit anyway. + * + * If it returns true, there was at least one object from pfmemalloc + * slab so simply flush everything. */ - __rcu_free_sheaf_prepare(s, sheaf); + if (__rcu_free_sheaf_prepare(s, sheaf)) + goto flush; n = get_node(s, sheaf->node); if (!n) @@ -6371,7 +6410,8 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) continue; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) + || slab_test_pfmemalloc(slab))) { remote_objects[remote_nr] = p[i]; p[i] = p[--size]; if (++remote_nr >= PCS_BATCH_MAX) @@ -6669,7 +6709,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, return; if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) { + slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { if (likely(free_to_pcs(s, object))) return; } From 31e0886fd57d426d18a239dd55e176032c9c1cb0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:32 +0100 Subject: [PATCH 06/42] slub: remove CONFIG_SLUB_TINY specific code paths CONFIG_SLUB_TINY minimizes the SLUB's memory overhead in multiple ways, mainly by avoiding percpu caching of slabs and objects. It also reduces code size by replacing some code paths with simplified ones through ifdefs, but the benefits of that are smaller and would complicate the upcoming changes. Thus remove these code paths and associated ifdefs and simplify the code base. Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-4-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab.h | 2 - mm/slub.c | 107 ++---------------------------------------------------- 2 files changed, 4 insertions(+), 105 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..f7b8df56727d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -236,10 +236,8 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { -#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; struct lock_class_key lock_key; -#endif struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; diff --git a/mm/slub.c b/mm/slub.c index bb744e8044f0..a7c6d79154f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -410,7 +410,6 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -#ifndef CONFIG_SLUB_TINY /* * When changing the layout, make sure freelist and tid are still compatible * with this_cpu_cmpxchg_double() alignment requirements. 
@@ -432,7 +431,6 @@ struct kmem_cache_cpu { unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif }; -#endif /* CONFIG_SLUB_TINY */ static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -597,12 +595,10 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } -#endif /* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@ -714,10 +710,12 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) return s->cpu_partial_slabs; } #else +#ifdef SLAB_SUPPORTS_SYSFS static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } +#endif static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) { @@ -2026,13 +2024,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } -#endif #endif /* CONFIG_SLUB_DEBUG */ #ifdef CONFIG_SLAB_OBJ_EXT @@ -3623,8 +3619,6 @@ static struct slab *get_partial(struct kmem_cache *s, int node, return get_any_partial(s, pc); } -#ifndef CONFIG_SLUB_TINY - #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -4024,12 +4018,6 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) return c->slab || slub_percpu_partial(c); } -#else /* CONFIG_SLUB_TINY */ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } -static inline void flush_this_cpu_slab(struct kmem_cache *s) { } -#endif /* CONFIG_SLUB_TINY */ - static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; @@ -4370,7 +4358,6 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -#ifndef CONFIG_SLUB_TINY static inline bool __update_cpu_freelist_fast(struct kmem_cache *s, void *freelist_old, void *freelist_new, @@ -4634,7 +4621,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = pc.object; /* * For debug caches here we had to go through @@ -4672,7 +4659,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLAB); - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) { @@ -4884,32 +4871,6 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s, return object; } -#else /* CONFIG_SLUB_TINY */ -static void *__slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) -{ - struct partial_context pc; - struct slab *slab; - void *object; - - pc.flags = gfpflags; - pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - - if (slab) - return pc.object; - - slab = new_slab(s, gfpflags, node); - if (unlikely(!slab)) { - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - - object = alloc_single_from_new_slab(s, slab, orig_size, 
gfpflags); - - return object; -} -#endif /* CONFIG_SLUB_TINY */ /* * If the object has been wiped upon free, make sure it's fully initialized by @@ -5760,9 +5721,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) * it did local_lock_irqsave(&s->cpu_slab->lock, flags). * In this case fast path with __update_cpu_freelist_fast() is not safe. */ -#ifndef CONFIG_SLUB_TINY if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) -#endif ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); if (PTR_ERR(ret) == -EBUSY) { @@ -6553,14 +6512,10 @@ static void free_deferred_objects(struct irq_work *work) llist_for_each_safe(pos, t, llnode) { struct slab *slab = container_of(pos, struct slab, llnode); -#ifdef CONFIG_SLUB_TINY - free_slab(slab->slab_cache, slab); -#else if (slab->frozen) deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); else free_slab(slab->slab_cache, slab); -#endif } } @@ -6596,7 +6551,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -6689,14 +6643,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s, } stat_add(s, FREE_FASTPATH, cnt); } -#else /* CONFIG_SLUB_TINY */ -static void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - __slab_free(s, slab, head, tail, cnt, addr); -} -#endif /* CONFIG_SLUB_TINY */ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, @@ -6974,11 +6920,7 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); -#ifndef CONFIG_SLUB_TINY do_slab_free(s, slab, x, x, 0, _RET_IP_); -#else - defer_free(s, x); -#endif } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -7428,7 +7370,6 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -#ifndef CONFIG_SLUB_TINY static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) @@ -7493,35 +7434,6 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, return 0; } -#else /* CONFIG_SLUB_TINY */ -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - int i; - - for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } - - p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, - _RET_IP_, s->object_size); - if (unlikely(!p[i])) - goto error; - - maybe_wipe_obj_freeptr(s, p[i]); - } - - return i; - -error: - __kmem_cache_free_bulk(s, i, p); - return 0; -} -#endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. 
*/ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, @@ -7740,7 +7652,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < @@ -7761,12 +7672,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } -#else -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) -{ - return 1; -} -#endif /* CONFIG_SLUB_TINY */ static int init_percpu_sheaves(struct kmem_cache *s) { @@ -7856,13 +7761,11 @@ void __kmem_cache_release(struct kmem_cache *s) cache_random_seq_destroy(s); if (s->cpu_sheaves) pcs_destroy(s); -#ifndef CONFIG_SLUB_TINY #ifdef CONFIG_PREEMPT_RT if (s->cpu_slab) lockdep_unregister_key(&s->lock_key); #endif free_percpu(s->cpu_slab); -#endif free_kmem_cache_nodes(s); } @@ -8605,10 +8508,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { -#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); -#endif } struct kmem_cache * From f6087b926aea65768975fd4cbc3775965cbd8621 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:29 +0100 Subject: [PATCH 07/42] slab: make __slab_free() more clear The function is tricky and many of its tests are hard to understand. Try to improve that by using more descriptively named variables and added comments. - rename 'prior' to 'old_head' to match the head and tail parameters - introduce a 'bool was_full' to make it more obvious what we are testing instead of the !prior and prior tests - add or improve comments in various places to explain what we're doing Also replace kmem_cache_has_cpu_partial() tests with IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) which are compile-time constants. We can do that because the kmem_cache_debug(s) case is handled upfront via free_to_partial_list(). Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-1-b8218e1ac7ef@suse.cz Signed-off-by: Vlastimil Babka --- mm/slub.c | 62 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f1a5373eee7b..074abe8e79f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5859,8 +5859,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - void *prior; - int was_frozen; + void *old_head; + bool was_frozen, was_full; struct slab new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -5874,20 +5874,37 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) + * is the only other reason it can be false, and it is already handled + * above. 
+ */ + do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = slab->freelist; + old_head = slab->freelist; counters = slab->counters; - set_freepointer(s, tail, prior); + set_freepointer(s, tail, old_head); new.counters = counters; - was_frozen = new.frozen; + was_frozen = !!new.frozen; + was_full = (old_head == NULL); new.inuse -= cnt; - if ((!new.inuse || !prior) && !was_frozen) { - /* Needs to be taken off a list */ - if (!kmem_cache_has_cpu_partial(s) || prior) { + /* + * Might need to be taken off (due to becoming empty) or added + * to (due to not being full anymore) the partial list. + * Unless it's frozen. + */ + if ((!new.inuse || was_full) && !was_frozen) { + /* + * If slab becomes non-full and we have cpu partial + * lists, we put it there unconditionally to avoid + * taking the list_lock. Otherwise we need it. + */ + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { n = get_node(s, slab_nid(slab)); /* @@ -5905,7 +5922,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } } while (!slab_update_freelist(s, slab, - prior, counters, + old_head, counters, head, new.counters, "__slab_free")); @@ -5917,7 +5934,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (kmem_cache_has_cpu_partial(s) && !prior) { + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { /* * If we started with a full slab then put it onto the * per cpu partial list. @@ -5926,6 +5943,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, stat(s, CPU_PARTIAL_FREE); } + /* + * In other cases we didn't take the list_lock because the slab + * was already on the partial list and will remain there. + */ + return; } @@ -5933,19 +5955,24 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * This slab was partially empty but not on the per-node partial list, * in which case we shouldn't manipulate its list, just return. */ - if (prior && !on_node_partial) { + if (!was_full && !on_node_partial) { spin_unlock_irqrestore(&n->list_lock, flags); return; } + /* + * If slab became empty, should we add/keep it on the partial list or we + * have enough? + */ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before - * then add it. + * then add it. This can only happen when cache has no per cpu partial + * list otherwise we would have put it there. */ - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -5953,10 +5980,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; slab_empty: - if (prior) { - /* - * Slab on the partial list. - */ + /* + * The slab could have a single object and thus go from full to empty in + * a single free, but more likely it was on the partial list. Remove it. + */ + if (likely(!was_full)) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } From 4c0a17e28340e458627d672564200406e220d6a3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:33 +0100 Subject: [PATCH 08/42] slab: prevent recursive kmalloc() in alloc_empty_sheaf() We want to expand usage of sheaves to all non-boot caches, including kmalloc caches. 
Since sheaves themselves are also allocated by kmalloc(), we need to prevent excessive or infinite recursion - depending on sheaf size, the sheaf can be allocated from smaller, same or larger kmalloc size bucket, there's no particular constraint. This is similar to allocating the objext arrays so let's just reuse the existing mechanisms for those. __GFP_NO_OBJ_EXT in alloc_empty_sheaf() will prevent a nested kmalloc() from allocating a sheaf itself - it will either have sheaves already, or fallback to a non-sheaf-cached allocation (so bootstrap of sheaves in a kmalloc cache that allocates sheaves from its own size bucket is possible). Additionally, reuse OBJCGS_CLEAR_MASK to clear unwanted gfp flags from the nested allocation. Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-5-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/gfp_types.h | 6 ------ mm/slub.c | 36 ++++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 65db9349f905..3de43b12209e 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -55,9 +55,7 @@ enum { #ifdef CONFIG_LOCKDEP ___GFP_NOLOCKDEP_BIT, #endif -#ifdef CONFIG_SLAB_OBJ_EXT ___GFP_NO_OBJ_EXT_BIT, -#endif ___GFP_LAST_BIT }; @@ -98,11 +96,7 @@ enum { #else #define ___GFP_NOLOCKDEP 0 #endif -#ifdef CONFIG_SLAB_OBJ_EXT #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -#else -#define ___GFP_NO_OBJ_EXT 0 -#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) diff --git a/mm/slub.c b/mm/slub.c index a7c6d79154f8..f729c208965b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2031,6 +2031,14 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, } #endif /* CONFIG_SLUB_DEBUG */ +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + #ifdef CONFIG_SLAB_OBJ_EXT #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG @@ -2081,14 +2089,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. 
- */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) - static inline void init_slab_obj_exts(struct slab *slab) { slab->obj_exts = 0; @@ -2596,8 +2596,24 @@ static void *setup_object(struct kmem_cache *s, void *object) static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) { - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, - s->sheaf_capacity), gfp); + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) return NULL; From b244358e9a1cd61276b8785b1b4275f1f45a1dc2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Nov 2025 14:51:23 +0100 Subject: [PATCH 09/42] slab: separate struct freelist_tid from kmem_cache_cpu In kmem_cache_cpu we currently have a union of the freelist+tid pair with freelist_aba_t, relying implicitly on the type compatibility with the freelist+counters pair used in freelist_aba_t. To allow further changes to freelist_aba_t, we can instead define a separate struct freelist_tid (instead of a typedef, per the coding style) for kmem_cache_cpu, as that affects only a single helper __update_cpu_freelist_fast(). We can add the resulting struct freelist_tid to kmem_cache_cpu as unnamed field thanks to -fms-extensions, so that freelist and tid fields can still be accessed directly. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 074abe8e79f8..5f6408c9e0fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -411,18 +411,22 @@ enum stat_item { }; #ifndef CONFIG_SLUB_TINY +struct freelist_tid { + union { + struct { + void *freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ + }; + freelist_full_t freelist_tid; + }; +}; + /* * When changing the layout, make sure freelist and tid are still compatible * with this_cpu_cmpxchg_double() alignment requirements. 
*/ struct kmem_cache_cpu { - union { - struct { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_aba_t freelist_tid; - }; + struct freelist_tid; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ @@ -4367,11 +4371,11 @@ __update_cpu_freelist_fast(struct kmem_cache *s, void *freelist_old, void *freelist_new, unsigned long tid) { - freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; - freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, - &old.full, new.full); + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, + &old.freelist_tid, new.freelist_tid); } /* From 3993ca9d6495e1e4d6fdaffc1bba0271059940c4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Nov 2025 14:51:24 +0100 Subject: [PATCH 10/42] slab: turn freelist_aba_t to a struct and fully define counters there In struct slab we currently have freelist and counters pair, where counters itself is a union of unsigned long with a sub-struct of several smaller fields. Then for the usage with double cmpxchg we have freelist_aba_t that duplicates the definition of the freelist+counters with implicitly the same layout as the full definition in struct slab. Thanks to -fms-extension we can now move the full counters definition to freelist_aba_t (while changing it to struct freelist_counters as a typedef is unnecessary and discouraged) and replace the relevant part in struct slab to an unnamed reference to it. The immediate benefit is the removal of duplication and no longer relying on the same layout implicitly. It also allows further cleanups thanks to having the full definition of counters in struct freelist_counters. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab.h | 52 ++++++++++++++++++++++++---------------------------- mm/slub.c | 8 +++++--- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..42627b87d50c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -40,13 +40,29 @@ typedef u64 freelist_full_t; * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. 
*/ -typedef union { - struct { - void *freelist; - unsigned long counter; +struct freelist_counters { + union { + struct { + void *freelist; + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ + unsigned frozen:1; + }; + }; + }; +#ifdef system_has_freelist_aba + freelist_full_t freelist_counters; +#endif }; - freelist_full_t full; -} freelist_aba_t; +}; /* Reuses the bits in struct page */ struct slab { @@ -69,27 +85,7 @@ struct slab { #endif }; /* Double-word boundary */ - union { - struct { - void *freelist; /* first free object */ - union { - unsigned long counters; - struct { - unsigned inuse:16; - unsigned objects:15; - /* - * If slab debugging is enabled then the - * frozen bit can be reused to indicate - * that the slab was corrupted - */ - unsigned frozen:1; - }; - }; - }; -#ifdef system_has_freelist_aba - freelist_aba_t freelist_counter; -#endif - }; + struct freelist_counters; }; struct rcu_head rcu_head; }; @@ -114,7 +110,7 @@ SLAB_MATCH(_unused_slab_obj_exts, obj_exts); #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) -static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); #endif /** diff --git a/mm/slub.c b/mm/slub.c index 5f6408c9e0fd..8330e4f8b3b2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -764,10 +764,12 @@ __update_freelist_fast(struct slab *slab, void *freelist_new, unsigned long counters_new) { #ifdef system_has_freelist_aba - freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; - freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; + struct freelist_counters old = { .freelist = freelist_old, .counters = counters_old }; + struct freelist_counters new = { .freelist = freelist_new, .counters = counters_new }; - return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); + return try_cmpxchg_freelist(&slab->freelist_counters, + &old.freelist_counters, + new.freelist_counters); #else return false; #endif From 32cf9f21828a752a364b2698ec66f8532cd66c52 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Nov 2025 14:51:25 +0100 Subject: [PATCH 11/42] slab: use struct freelist_counters for local variables instead of struct slab In several functions we declare local struct slab variables so we can work with the freelist and counters fields (including the sub-counters that are in the union) comfortably. With struct freelist_counters containing the full counters definition, we can now reduce the local variables to that type as we don't need the other fields in struct slab. 
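As a standalone illustration of why such a local variable is enough (plain C11, flattened and simplified relative to the struct in mm/slab.h; the bitfield widths are copied from the definition above, everything else is a demo-only stand-in), writing the inuse/objects/frozen bitfields updates the same word that is later passed around as counters:

  #include <stdio.h>

  struct freelist_counters_demo {
  	void *freelist;
  	union {
  		unsigned long counters;
  		struct {
  			unsigned inuse:16;
  			unsigned objects:15;
  			unsigned frozen:1;
  		};
  	};
  };

  int main(void)
  {
  	struct freelist_counters_demo new = { 0 };

  	new.inuse = 3;
  	new.objects = 32;
  	new.frozen = 1;

  	/* the bitfields and .counters alias the same word */
  	printf("counters word: %#lx\n", new.counters);
  	return 0;
  }
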
Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 8330e4f8b3b2..a55e0af26ec7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3725,8 +3725,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; - struct slab new; - struct slab old; + struct freelist_counters old, new; if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); @@ -4390,7 +4389,7 @@ __update_cpu_freelist_fast(struct kmem_cache *s, */ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct slab new; + struct freelist_counters new; unsigned long counters; void *freelist; @@ -4418,7 +4417,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) */ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) { - struct slab new; + struct freelist_counters new; unsigned long counters; void *freelist; @@ -5867,7 +5866,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, { void *old_head; bool was_frozen, was_full; - struct slab new; + struct freelist_counters new; unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; From c33196c9429a1db5bc6cded27b6286f341ad6be0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Nov 2025 14:51:26 +0100 Subject: [PATCH 12/42] slab: use struct freelist_counters as parameters in relevant functions In functions such as [__]slab_update_freelist() and __slab_update_freelist_fast/slow() we pass old and new freelist and counters as 4 separate parameters. The underlying __update_freelist_fast() then constructs struct freelist_counters variables for passing the full freelist+counter combinations to cmpxchg double. In most cases we actually start with struct freelist_counters variables, but then pass the individual fields, only to construct new struct freelist_counters variables. While it's all inlined and thus should be efficient, we can simplify this code. Thus replace the 4 parameters for individual fields with two pointers to struct freelist_counters wherever applicable. __update_freelist_fast() can then pass them directly to try_cmpxchg_freelist(). The code is also more obvious as the pattern becomes unified such that we set up "old" and "new" struct freelist_counters variables upfront as we fully need them to be, and simply call [__]slab_update_freelist() on them. Previously some of the "new" values would be hidden among the many parameters and thus make it harder to figure out what the code does. 
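A standalone userspace mock of the resulting call pattern (the types and mock_update_freelist() here are simplified stand-ins, not the kernel's; the point is only the shape: build old and new completely up front, then retry the whole update until it succeeds):

  #include <stdbool.h>
  #include <stdio.h>

  struct fc_demo {
  	void *freelist;
  	unsigned long counters;
  };

  struct slab_demo {
  	struct fc_demo fc;
  };

  /* mock of slab_update_freelist(): succeeds only if the slab still
   * matches *old, then installs *new */
  static bool mock_update_freelist(struct slab_demo *slab,
  				 const struct fc_demo *old,
  				 const struct fc_demo *new)
  {
  	if (slab->fc.freelist != old->freelist ||
  	    slab->fc.counters != old->counters)
  		return false;
  	slab->fc = *new;
  	return true;
  }

  int main(void)
  {
  	struct slab_demo slab = { .fc = { .freelist = NULL, .counters = 42 } };
  	struct fc_demo old, new;
  	int object;

  	do {
  		old = slab.fc;		/* snapshot the current state */
  		new = old;		/* describe the desired state upfront */
  		new.freelist = &object;
  		new.counters++;
  	} while (!mock_update_freelist(&slab, &old, &new));

  	printf("counters now %lu\n", slab.fc.counters);
  	return 0;
  }
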
Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 126 ++++++++++++++++++++++-------------------------------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a55e0af26ec7..ddd71f4937fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -759,34 +759,29 @@ static __always_inline void slab_unlock(struct slab *slab) } static inline bool -__update_freelist_fast(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_fast(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { #ifdef system_has_freelist_aba - struct freelist_counters old = { .freelist = freelist_old, .counters = counters_old }; - struct freelist_counters new = { .freelist = freelist_new, .counters = counters_new }; - return try_cmpxchg_freelist(&slab->freelist_counters, - &old.freelist_counters, - new.freelist_counters); + &old->freelist_counters, + new->freelist_counters); #else return false; #endif } static inline bool -__update_freelist_slow(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_slow(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { bool ret = false; slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; + if (slab->freelist == old->freelist && + slab->counters == old->counters) { + slab->freelist = new->freelist; + slab->counters = new->counters; ret = true; } slab_unlock(slab); @@ -802,22 +797,18 @@ __update_freelist_slow(struct slab *slab, * interrupt the operation. */ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); - if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); - } else { - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); - } + if (s->flags & __CMPXCHG_DOUBLE) + ret = __update_freelist_fast(slab, old, new); + else + ret = __update_freelist_slow(slab, old, new); + if (likely(ret)) return true; @@ -832,21 +823,17 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla } static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_fast(slab, old, new); } else { unsigned long flags; local_irq_save(flags); - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_slow(slab, old, new); local_irq_restore(flags); } if (likely(ret)) @@ -3774,10 +3761,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } else { new.freelist = old.freelist; } - } while (!slab_update_freelist(s, slab, - old.freelist, old.counters, - 
new.freelist, new.counters, - "unfreezing slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); /* * Stage three: Manipulate the slab list based on the updated state. @@ -4389,27 +4373,24 @@ __update_cpu_freelist_fast(struct kmem_cache *s, */ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct freelist_counters new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; + new.freelist = NULL; + new.counters = old.counters; - new.inuse = slab->objects; - new.frozen = freelist != NULL; + new.inuse = old.objects; + new.frozen = old.freelist != NULL; - } while (!__slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "get_freelist")); - return freelist; + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + + return old.freelist; } /* @@ -4417,26 +4398,22 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) */ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) { - struct freelist_counters new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; + new.freelist = NULL; + new.counters = old.counters; VM_BUG_ON(new.frozen); - new.inuse = slab->objects; + new.inuse = old.objects; new.frozen = 1; - } while (!slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "freeze_slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - return freelist; + return old.freelist; } /* @@ -5864,10 +5841,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - void *old_head; bool was_frozen, was_full; - struct freelist_counters new; - unsigned long counters; + struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; @@ -5891,13 +5866,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - old_head = slab->freelist; - counters = slab->counters; - set_freepointer(s, tail, old_head); - new.counters = counters; - was_frozen = !!new.frozen; - was_full = (old_head == NULL); + + old.freelist = slab->freelist; + old.counters = slab->counters; + + was_full = (old.freelist == NULL); + was_frozen = old.frozen; + + set_freepointer(s, tail, old.freelist); + + new.freelist = head; + new.counters = old.counters; new.inuse -= cnt; + /* * Might need to be taken off (due to becoming empty) or added * to (due to not being full anymore) the partial list. @@ -5926,10 +5907,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } } - } while (!slab_update_freelist(s, slab, - old_head, counters, - head, new.counters, - "__slab_free")); + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { From 6adf4b11fa50a31dc1c3791131020e624c6f139d Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Tue, 11 Nov 2025 20:22:05 +0800 Subject: [PATCH 13/42] mm: simplify list initialization in barn_shrink() In barn_shrink(), use LIST_HEAD() to declare and initialize the list_head in one step instead of using INIT_LIST_HEAD() separately. 
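For illustration, a self-contained userspace sketch using simplified copies of the list helpers (the real definitions live in include/linux/list.h and use WRITE_ONCE()); both forms leave the head pointing at itself:

  #include <stdio.h>

  struct list_head { struct list_head *next, *prev; };

  #define LIST_HEAD_INIT(name)	{ &(name), &(name) }
  #define LIST_HEAD(name)		struct list_head name = LIST_HEAD_INIT(name)

  static inline void INIT_LIST_HEAD(struct list_head *list)
  {
  	list->next = list;
  	list->prev = list;
  }

  int main(void)
  {
  	/* one-step declaration and initialization */
  	LIST_HEAD(full_list);

  	/* the two-step form it replaces */
  	struct list_head empty_list;
  	INIT_LIST_HEAD(&empty_list);

  	printf("both self-referential: %d %d\n",
  	       full_list.next == &full_list, empty_list.next == &empty_list);
  	return 0;
  }
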
No functional change. Signed-off-by: Baolin Liu Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f729c208965b..72eeeefd0a89 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3032,14 +3032,11 @@ static void barn_init(struct node_barn *barn) static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) { - struct list_head empty_list; - struct list_head full_list; + LIST_HEAD(empty_list); + LIST_HEAD(full_list); struct slab_sheaf *sheaf, *sheaf2; unsigned long flags; - INIT_LIST_HEAD(&empty_list); - INIT_LIST_HEAD(&full_list); - spin_lock_irqsave(&barn->lock, flags); list_splice_init(&barn->sheaves_full, &full_list); From 2bcd3800f2da1be13b972858f63c66d035b1ec6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:15 +0000 Subject: [PATCH 14/42] slab: Reimplement page_slab() In order to separate slabs from folios, we need to convert from any page in a slab to the slab directly without going through a page to folio conversion first. Up to this point, page_slab() has followed the example of other memdesc converters (page_folio(), page_ptdesc() etc) and just cast the pointer to the requested type, regardless of whether the pointer is actually a pointer to the correct type or not. That changes with this commit; we check that the page actually belongs to a slab and return NULL if it does not. Other memdesc converters will adopt this convention in future. kfence was the only user of page_slab(), so adjust it to the new way of working. It will need to be touched again when we separate slab from page. Signed-off-by: Matthew Wilcox (Oracle) Cc: Alexander Potapenko Cc: Marco Elver Cc: kasan-dev@googlegroups.com Link: https://patch.msgid.link/20251113000932.1589073-2-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Tested-by: Marco Elver Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 14 +------------- mm/kfence/core.c | 14 ++++++++------ mm/slab.h | 28 ++++++++++++++++------------ 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0091ad1986bf..6d5e44968eab 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) -FOLIO_TYPE_OPS(slab, slab) - -/** - * PageSlab - Determine if the page belongs to the slab allocator - * @page: The page to test. - * - * Context: Any context. - * Return: True for slab pages, false for any other kind of page. - */ -static inline bool PageSlab(const struct page *page) -{ - return folio_test_slab(page_folio(page)); -} +PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 727c20c94ac5..e62b5516bf48 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. 
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); - __folio_set_slab(slab_folio(slab)); + page = pfn_to_page(start_pfn + i); + __SetPageSlab(page); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); + page = pfn_to_page(start_pfn + i); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = 0; #endif - __folio_clear_slab(slab_folio(slab)); + __ClearPageSlab(page); } return addr; diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..a64b9b2c8731 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -146,20 +146,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) struct slab *: (struct folio *)s)) /** - * page_slab - Converts from first struct page to slab. - * @p: The first (either head of compound or single) page of slab. + * page_slab - Converts from struct page to its slab. + * @page: A page which may or may not belong to a slab. * - * A temporary wrapper to convert struct page to struct slab in situations where - * we know the page is the compound head, or single order-0 page. - * - * Long-term ideally everything would work with struct slab directly or go - * through folio to struct slab. - * - * Return: The slab which contains this page + * Return: The slab which contains this page or NULL if the page does + * not belong to a slab. This includes pages returned from large kmalloc. */ -#define page_slab(p) (_Generic((p), \ - const struct page *: (const struct slab *)(p), \ - struct page *: (struct slab *)(p))) +static inline struct slab *page_slab(const struct page *page) +{ + unsigned long head; + + head = READ_ONCE(page->compound_head); + if (head & 1) + page = (struct page *)(head - 1); + if (data_race(page->page_type >> 24) != PGTY_slab) + page = NULL; + + return (struct slab *)page; +} /** * slab_page - The first struct page allocated for a slab From ee1ee8abc4197e21594ca29348629ccbfff4daec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:16 +0000 Subject: [PATCH 15/42] slab: Remove folio references from __ksize() In the future, we will separate slab, folio and page from each other and calling virt_to_folio() on an address allocated from slab will return NULL. Delay the conversion from struct page to struct slab until we know we're not dealing with a large kmalloc allocation. There's a minor win for large kmalloc allocations as we avoid the compound_head() hidden in virt_to_folio(). This deprecates calling ksize() on memory allocated by alloc_pages(). Today it becomes a warning and support will be removed entirely in the future. Introduce large_kmalloc_size() to abstract how we represent the size of a large kmalloc allocation. For now, this is the same as page_size(), but it will change with separately allocated memdescs. 
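To make the new calling convention concrete, here is a minimal sketch of how a caller combines page_slab() with the PageLargeKmalloc()/large_kmalloc_size() helpers introduced below. The function name object_footprint() is invented purely for illustration, and the real __ksize() in this patch additionally warns about the deprecated alloc_pages() callers:

/* Illustrative only: a simplified version of the lookup __ksize() performs. */
static size_t object_footprint(const void *ptr)
{
	const struct page *page = virt_to_page(ptr);
	const struct slab *slab = page_slab(page);

	if (slab)				/* ordinary slab object */
		return slab_ksize(slab->slab_cache);
	if (PageLargeKmalloc(page))		/* kmalloc() above KMALLOC_MAX_CACHE_SIZE */
		return large_kmalloc_size(page);
	return 0;				/* not a slab or kmalloc allocation */
}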
Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-3-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 2 +- mm/slab.h | 10 ++++++++++ mm/slab_common.c | 23 ++++++++++++----------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6d5e44968eab..f7a0e4af0c73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1064,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) -FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) +PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/mm/slab.h b/mm/slab.h index a64b9b2c8731..31ccf0f6d3a1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -605,6 +605,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +static inline unsigned int large_kmalloc_order(const struct page *page) +{ + return page[1].flags.f & 0xff; +} + +static inline size_t large_kmalloc_size(const struct page *page) +{ + return PAGE_SIZE << large_kmalloc_order(page); +} + #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index 932d13ada36c..67ad2328276e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void) */ size_t __ksize(const void *object) { - struct folio *folio; + const struct page *page; + const struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return 0; - folio = virt_to_folio(object); + page = virt_to_page(object); - if (unlikely(!folio_test_slab(folio))) { - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) - return 0; - if (WARN_ON(object != folio_address(folio))) - return 0; - return folio_size(folio); - } + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); #ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(folio_slab(folio)->slab_cache, object); + skip_orig_size_check(slab->slab_cache, object); #endif - return slab_ksize(folio_slab(folio)->slab_cache); + return slab_ksize(slab->slab_cache); } gfp_t kmalloc_fix_flags(gfp_t flags) From ea4702b1708ee3df8da06f07ce41fea84e6ed81d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:17 +0000 Subject: [PATCH 16/42] slab: Remove folio references in memcg_slab_post_charge() This allows us to skip the compound_head() call for large kmalloc objects as the virt_to_page() call will always give us the head page for the large kmalloc case. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251113000932.1589073-4-willy@infradead.org Signed-off-by: Vlastimil Babka --- mm/slub.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d4367f25b20d..a7c0662f89c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2369,33 +2369,34 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) { struct slabobj_ext *slab_exts; struct kmem_cache *s; - struct folio *folio; + struct page *page; struct slab *slab; unsigned long off; - folio = virt_to_folio(p); - if (!folio_test_slab(folio)) { + page = virt_to_page(p); + if (PageLargeKmalloc(page)) { + unsigned int order; int size; - if (folio_memcg_kmem(folio)) + if (PageMemcgKmem(page)) return true; - if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, - folio_order(folio))) + order = large_kmalloc_order(page); + if (__memcg_kmem_charge_page(page, flags, order)) return false; /* - * This folio has already been accounted in the global stats but + * This page has already been accounted in the global stats but * not in the memcg stats. So, subtract from the global and use * the interface which adds to both global and memcg stats. */ - size = folio_size(folio); - node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + size = PAGE_SIZE << order; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); return true; } - slab = folio_slab(folio); + slab = page_slab(page); s = slab->slab_cache; /* From 09fa19e2f3a512310bf4287546fc0f2b10a63e5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:18 +0000 Subject: [PATCH 17/42] slab: Remove folio references in slab alloc/free Use pages directly to further the split between slab and folio. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251113000932.1589073-5-willy@infradead.org Signed-off-by: Vlastimil Babka --- mm/slub.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a7c0662f89c6..112222eacdcb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3045,24 +3045,24 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct kmem_cache_order_objects oo, bool allow_spin) { - struct folio *folio; + struct page *page; struct slab *slab; unsigned int order = oo_order(oo); if (unlikely(!allow_spin)) - folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, node, order); else if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages(flags, order); + page = alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); + page = __alloc_frozen_pages(flags, order, node, NULL); - if (!folio) + if (!page) return NULL; - slab = folio_slab(folio); - __folio_set_slab(folio); - if (folio_is_pfmemalloc(folio)) + __SetPageSlab(page); + slab = page_slab(page); + if (page_is_pfmemalloc(page)) slab_set_pfmemalloc(slab); return slab; @@ -3286,16 +3286,16 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) static void __free_slab(struct kmem_cache *s, struct slab *slab) { - struct folio *folio = slab_folio(slab); - int order = folio_order(folio); + struct page *page = slab_page(slab); + int order = compound_order(page); int pages = 1 << order; __slab_clear_pfmemalloc(slab); - folio->mapping = NULL; - __folio_clear_slab(folio); + page->mapping = NULL; + __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(&folio->page, order); + free_frozen_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) From f9395bf5db450ccbf58eb737c227485df6aab26c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:19 +0000 Subject: [PATCH 18/42] slab: Remove folio references from ___kmalloc_large_node() There's no need to use folio APIs here; just use a page directly. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251113000932.1589073-6-willy@infradead.org Signed-off-by: Vlastimil Babka --- mm/slub.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 112222eacdcb..458e58e37480 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5571,7 +5571,7 @@ unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) */ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct folio *folio; + struct page *page; void *ptr = NULL; unsigned int order = get_order(size); @@ -5581,15 +5581,15 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) flags |= __GFP_COMP; if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); + page = alloc_frozen_pages_noprof(flags, order); else - folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); - if (folio) { - ptr = folio_address(folio); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); - __folio_set_large_kmalloc(folio); + __SetPageLargeKmalloc(page); } ptr = kasan_kmalloc_large(ptr, size, flags); From 0bdfdd6a05aa51fa66bae15af79dba977eeaffe9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:20 +0000 Subject: [PATCH 19/42] slab: Remove folio references from free_large_kmalloc() There's no need to use folio APIs here; just use a page directly. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251113000932.1589073-7-willy@infradead.org Signed-off-by: Vlastimil Babka --- mm/slub.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 458e58e37480..79b71ee47f63 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6742,12 +6742,12 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -static void free_large_kmalloc(struct folio *folio, void *object) +static void free_large_kmalloc(struct page *page, void *object) { - unsigned int order = folio_order(folio); + unsigned int order = compound_order(page); - if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { - dump_page(&folio->page, "Not a kmalloc allocation"); + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { + dump_page(page, "Not a kmalloc allocation"); return; } @@ -6758,10 +6758,10 @@ static void free_large_kmalloc(struct folio *folio, void *object) kasan_kfree_large(object); kmsan_kfree_large(object); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __folio_clear_large_kmalloc(folio); - free_frozen_pages(&folio->page, order); + __ClearPageLargeKmalloc(page); + free_frozen_pages(page, order); } /* @@ -6789,7 +6789,7 @@ void kvfree_rcu_cb(struct rcu_head *head) * consider folio order */ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); - free_large_kmalloc(folio, obj); + free_large_kmalloc(&folio->page, obj); return; } @@ -6829,7 +6829,7 @@ void kfree(const void *object) folio = virt_to_folio(object); if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, (void *)object); + free_large_kmalloc(&folio->page, (void *)object); return; } @@ -7253,7 +7253,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!s) { /* Handle 
kalloc'ed objects */ if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); + free_large_kmalloc(&folio->page, object); df->slab = NULL; return size; } From f262cfd75d52eb285d696d0c7357dc853d7bc7ea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:21 +0000 Subject: [PATCH 20/42] slab: Remove folio references from kvfree_rcu_cb() Remove conversions from folio to page and folio to slab. This is preparation for separately allocated struct slab from struct page. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-8-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 79b71ee47f63..56c7ddff43fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6771,7 +6771,7 @@ static void free_large_kmalloc(struct page *page, void *object) void kvfree_rcu_cb(struct rcu_head *head) { void *obj = head; - struct folio *folio; + struct page *page; struct slab *slab; struct kmem_cache *s; void *slab_addr; @@ -6782,20 +6782,20 @@ void kvfree_rcu_cb(struct rcu_head *head) return; } - folio = virt_to_folio(obj); - if (!folio_test_slab(folio)) { + page = virt_to_page(obj); + slab = page_slab(page); + if (!slab) { /* * rcu_head offset can be only less than page size so no need to - * consider folio order + * consider allocation order */ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); - free_large_kmalloc(&folio->page, obj); + free_large_kmalloc(page, obj); return; } - slab = folio_slab(folio); s = slab->slab_cache; - slab_addr = folio_address(folio); + slab_addr = slab_address(slab); if (is_kfence_address(obj)) { obj = kfence_object_start(obj); From e4090216859054043f8ba50866a2fb9c8e6d6b5b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:22 +0000 Subject: [PATCH 21/42] slab: Remove folio references from kfree() This should generate identical code to the previous version, but without any dependency on how folios work. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-9-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 56c7ddff43fa..0386994c0cb7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6817,7 +6817,7 @@ void kvfree_rcu_cb(struct rcu_head *head) */ void kfree(const void *object) { - struct folio *folio; + struct page *page; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6827,13 +6827,13 @@ void kfree(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(&folio->page, (void *)object); + page = virt_to_page(object); + slab = page_slab(page); + if (!slab) { + free_large_kmalloc(page, (void *)object); return; } - slab = folio_slab(folio); s = slab->slab_cache; slab_free(s, slab, x, _RET_IP_); } From 5db009dc10b16056ed340a488e948855def63fca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:23 +0000 Subject: [PATCH 22/42] slab: Remove folio references from __do_krealloc() One slight tweak I made is to calculate 'ks' earlier, which means we can reuse it in the warning rather than calculating the object size twice. 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-10-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0386994c0cb7..d26ebae2a759 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6932,16 +6932,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { - struct folio *folio; + struct page *page = virt_to_page(p); + struct slab *slab = page_slab(page); - folio = virt_to_folio(p); - if (unlikely(!folio_test_slab(folio))) { + if (!slab) { /* Big kmalloc object */ - WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); - WARN_ON(p != folio_address(folio)); - ks = folio_size(folio); + ks = page_size(page); + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != page_address(page)); } else { - s = folio_slab(folio)->slab_cache; + s = slab->slab_cache; orig_size = get_orig_size(s, (void *)p); ks = s->object_size; } From 4a2c2110a343b7c8762982c355ba34acf563b08a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:24 +0000 Subject: [PATCH 23/42] slab: Remove folio references from build_detached_freelist() Use pages and slabs directly instead of converting to folios. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-11-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d26ebae2a759..3ada9421c65d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7245,23 +7245,25 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, { int lookahead = 3; void *object; - struct folio *folio; + struct page *page; + struct slab *slab; size_t same; object = p[--size]; - folio = virt_to_folio(object); + page = virt_to_page(object); + slab = page_slab(page); if (!s) { /* Handle kalloc'ed objects */ - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(&folio->page, object); + if (!slab) { + free_large_kmalloc(page, object); df->slab = NULL; return size; } /* Derive kmem_cache from object */ - df->slab = folio_slab(folio); - df->s = df->slab->slab_cache; + df->slab = slab; + df->s = slab->slab_cache; } else { - df->slab = folio_slab(folio); + df->slab = slab; df->s = cache_from_obj(s, object); /* Support for memcg */ } From 7d26842fd43cb0f7e29c9f8e98af9091ccb0aef5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:25 +0000 Subject: [PATCH 24/42] slab: Remove folio references from kfree_rcu_sheaf() In preparation for splitting struct slab from struct page and struct folio, remove mentions of struct folio from this function. Since we don't need to handle large kmalloc objects specially here, we can just use virt_to_slab(). 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-12-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 67ad2328276e..84dfff4f7b1f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1615,17 +1615,15 @@ static void kfree_rcu_work(struct work_struct *work) static bool kfree_rcu_sheaf(void *obj) { struct kmem_cache *s; - struct folio *folio; struct slab *slab; if (is_vmalloc_addr(obj)) return false; - folio = virt_to_folio(obj); - if (unlikely(!folio_test_slab(folio))) + slab = virt_to_slab(obj); + if (unlikely(!slab)) return false; - slab = folio_slab(folio); s = slab->slab_cache; if (s->cpu_sheaves) { if (likely(!IS_ENABLED(CONFIG_NUMA) || From 025f5b870b2c4f30cbf452c5b07f9ab249cf73ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:26 +0000 Subject: [PATCH 25/42] slab: Remove folio references from kfree_nolock() In preparation for splitting struct slab from struct page and struct folio, remove mentions of struct folio from this function. Since large kmalloc objects are not supported here, we can just use virt_to_slab(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-13-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3ada9421c65d..8e5da7b6efe4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6850,7 +6850,6 @@ EXPORT_SYMBOL(kfree); */ void kfree_nolock(const void *object) { - struct folio *folio; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6858,13 +6857,12 @@ void kfree_nolock(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { + slab = virt_to_slab(object); + if (unlikely(!slab)) { WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); return; } - slab = folio_slab(folio); s = slab->slab_cache; memcg_slab_free_hook(s, slab, &x, 1); From 5934b1be8dbe67fa728eff0e68cbafb958c55aa5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:27 +0000 Subject: [PATCH 26/42] usercopy: Remove folio references from check_heap_object() Use page_slab() instead of virt_to_folio() followed by folio_slab(). We do end up calling compound_head() twice for non-slab copies, but that will not be a problem once we allocate memdescs separately. Signed-off-by: Matthew Wilcox (Oracle) Cc: Kees Cook Cc: Gustavo A. R. 
Silva Cc: linux-hardening@vger.kernel.org Link: https://patch.msgid.link/20251113000932.1589073-14-willy@infradead.org Reviewed-by: Harry Yoo Reviewed-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/usercopy.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index dbdcc43964fb..5de7a518b1b1 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -164,7 +164,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, { unsigned long addr = (unsigned long)ptr; unsigned long offset; - struct folio *folio; + struct page *page; + struct slab *slab; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); @@ -189,16 +190,23 @@ static inline void check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return; - folio = virt_to_folio(ptr); - - if (folio_test_slab(folio)) { + page = virt_to_page(ptr); + slab = page_slab(page); + if (slab) { /* Check slab allocator for flags and size. */ - __check_heap_object(ptr, n, folio_slab(folio), to_user); - } else if (folio_test_large(folio)) { - offset = ptr - folio_address(folio); - if (n > folio_size(folio) - offset) + __check_heap_object(ptr, n, slab, to_user); + } else if (PageCompound(page)) { + page = compound_head(page); + offset = ptr - page_address(page); + if (n > page_size(page) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } + + /* + * We cannot check non-compound pages. They might be part of + * a large allocation, in which case crossing a page boundary + * is fine. + */ } DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, From 0f2620ffc41d117cc28bc053efe2dc837cf748dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:42 +0100 Subject: [PATCH 27/42] fault-inject: make enum fault_flags available unconditionally This will allow using should_fail_ex from code without having to make it conditional on CONFIG_FAULT_INJECTION. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-2-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/fault-inject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 8c829d28dcf3..58fd14c82270 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -8,6 +8,10 @@ struct dentry; struct kmem_cache; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #ifdef CONFIG_FAULT_INJECTION #include @@ -36,10 +40,6 @@ struct fault_attr { struct dentry *dname; }; -enum fault_flags { - FAULT_NOWARN = 1 << 0, -}; - #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ From e9939cebc0be8dabb1798b357e9dadf6398fa859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:43 +0100 Subject: [PATCH 28/42] mm: improve kerneldoc comments for __alloc_pages_bulk Describe the semantincs in more detail, as the filling empty slots in an array scheme is not quite obvious. 
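As a usage illustration of the semantics being documented here — a sketch under stated assumptions, not kernel code: it presumes the common three-argument alloc_pages_bulk() wrapper around this function and a caller-zeroed array, and fill_pages() is an invented name:

static void fill_pages(struct page **pages, unsigned long nr)
{
	unsigned long got = 0;

	/* pages[] must be NULL-initialised; only NULL slots get populated. */
	while (got < nr) {
		got = alloc_pages_bulk(GFP_KERNEL, nr, pages);
		/*
		 * The return value counts every populated slot, including
		 * ones that were already non-NULL on entry, and populated
		 * slots are contiguous from index 0.
		 */
		if (got < nr)
			cond_resched();	/* a real caller might fall back to single-page allocation here */
	}
}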
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-3-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/page_alloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..b3d37169a553 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4982,13 +4982,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * @nr_pages: The number of pages desired in the array * @page_array: Array to store the pages * - * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to the page_array. + * This is a batched version of the page allocator that attempts to allocate + * @nr_pages quickly. Pages are added to @page_array. * - * Note that only NULL elements are populated with pages and nr_pages - * is the maximum number of pages that will be stored in the array. + * Note that only the elements in @page_array that were cleared to %NULL on + * entry are populated with newly allocated pages. @nr_pages is the maximum + * number of pages that will be stored in the array. * - * Returns the number of pages in the array. + * Returns the number of pages in @page_array, including ones already + * allocated on entry. This can be less than the number requested in @nr_pages, + * but all empty slots are filled from the beginning. I.e., if all slots in + * @page_array were set to %NULL on entry, the slots from 0 to the return value + * - 1 will be filled. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, From 5c829783e5f8dbb7ca6fce50c5c4a33f7c75d0d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:44 +0100 Subject: [PATCH 29/42] mempool: improve kerneldoc comments Use proper formatting, use full sentences and reduce some verbosity in function parameter descriptions. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-4-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 1c38e873e546..1f4701713203 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -372,18 +372,20 @@ int mempool_resize(mempool_t *pool, int new_min_nr) EXPORT_SYMBOL(mempool_resize); /** - * mempool_alloc - allocate an element from a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * @gfp_mask: the usual allocation bitmask. + * mempool_alloc - allocate an element from a memory pool + * @pool: pointer to the memory pool + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. * - * this function only sleeps if the alloc_fn() function sleeps or - * returns NULL. Note that due to preallocation, this function - * *never* fails when called from process contexts. (it might - * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Allocate an element from @pool. This is done by first calling into the + * alloc_fn supplied at pool initialization time, and dipping into the reserved + * pool when alloc_fn fails to allocate an element. * - * Return: pointer to the allocated element or %NULL on error. + * This function only sleeps if the alloc_fn callback sleeps, or when waiting + * for elements to become available in the pool. 
+ * + * Return: pointer to the allocated element or %NULL when failing to allocate + * an element. Allocation failure can only happen when @gfp_mask does not + * include %__GFP_DIRECT_RECLAIM. */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { @@ -456,11 +458,10 @@ EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements - * belonging to a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * belonging to a memory pool + * @pool: pointer to the memory pool * - * This function is similar to mempool_alloc, but it only attempts allocating + * This function is similar to mempool_alloc(), but it only attempts allocating * an element from the preallocated elements. It does not sleep and immediately * returns if no preallocated elements are available. * @@ -492,12 +493,14 @@ void *mempool_alloc_preallocated(mempool_t *pool) EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to the pool. - * @element: pool element pointer. - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_free - return an element to a mempool + * @element: pointer to element + * @pool: pointer to the memory pool * - * this function only sleeps if the free_fn() function sleeps. + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. */ void mempool_free(void *element, mempool_t *pool) { From b77fc08e393b77883bcb71825cfd49e44da44022 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:45 +0100 Subject: [PATCH 30/42] mempool: add error injection support Add a call to should_fail_ex that forces mempool to actually allocate from the pool to stress the mempool implementation when enabled through debugfs. By default should_fail{,_ex} prints a very verbose stack trace that clutters the kernel log, slows down execution and triggers the kernel bug detection in xfstests. Pass FAULT_NOWARN and print a single-line message notating the caller instead so that full tests can be run with fault injection. 
Signed-off-by: Christoph Hellwig Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20251113084022.1255121-5-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 1f4701713203..5cf59779cc3d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -9,7 +9,7 @@ * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ - +#include #include #include #include @@ -20,6 +20,15 @@ #include #include "slab.h" +static DECLARE_FAULT_ATTR(fail_mempool_alloc); + +static int __init mempool_faul_inject_init(void) +{ + return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + NULL, &fail_mempool_alloc)); +} +late_initcall(mempool_faul_inject_init); + #ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) @@ -404,9 +413,15 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + element = NULL; + } else { + element = pool->alloc(gfp_temp, pool->pool_data); + } - element = pool->alloc(gfp_temp, pool->pool_data); - if (likely(element != NULL)) + if (likely(element)) return element; spin_lock_irqsave(&pool->lock, flags); From 3d2492401d3cdb8e9e1276c3af5f1cd0c8a2b076 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:46 +0100 Subject: [PATCH 31/42] mempool: factor out a mempool_adjust_gfp helper Add a helper to better isolate and document the gfp flags adjustments. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-6-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 5cf59779cc3d..a0718a35c34f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -380,6 +380,19 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); +/* + * Adjust the gfp flags for mempool allocations, as we never want to dip into + * the global emergency reserves or retry in the page allocator. + * + * The first pass also doesn't want to go reclaim, but the next passes do, so + * return a separate subset for that first iteration. 
+ */ +static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) +{ + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); +} + /** * mempool_alloc - allocate an element from a memory pool * @pool: pointer to the memory pool @@ -398,20 +411,14 @@ EXPORT_SYMBOL(mempool_resize); */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; unsigned long flags; wait_queue_entry_t wait; - gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ - gfp_mask |= __GFP_NOWARN; /* failures are OK */ - - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); - repeat_alloc: if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { pr_info("forcing mempool usage for %pS\n", From b8557d109e7de6962ad4fe217b93316f4e659130 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:28 +0000 Subject: [PATCH 32/42] memcg: Convert mem_cgroup_from_obj_folio() to mem_cgroup_from_obj_slab() In preparation for splitting struct slab from struct page and struct folio, convert the pointer to a slab rather than a folio. This means we can end up passing a NULL slab pointer to mem_cgroup_from_obj_slab() if the pointer is not to a page allocated to slab, and we handle that appropriately by returning NULL. Signed-off-by: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: cgroups@vger.kernel.org Link: https://patch.msgid.link/20251113000932.1589073-15-willy@infradead.org Acked-by: Johannes Weiner Signed-off-by: Vlastimil Babka --- mm/memcontrol.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f4..b46356da6c0e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2557,38 +2557,25 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, } static __always_inline -struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) +struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * slab->obj_exts. */ - if (folio_test_slab(folio)) { - struct slabobj_ext *obj_exts; - struct slab *slab; - unsigned int off; - - slab = folio_slab(folio); - obj_exts = slab_obj_exts(slab); - if (!obj_exts) - return NULL; - - off = obj_to_index(slab->slab_cache, slab, p); - if (obj_exts[off].objcg) - return obj_cgroup_memcg(obj_exts[off].objcg); + struct slabobj_ext *obj_exts; + unsigned int off; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; - } - /* - * folio_memcg_check() is used here, because in theory we can encounter - * a folio where the slab flag has been cleared already, but - * slab->obj_exts has not been freed yet - * folio_memcg_check() will guarantee that a proper memory - * cgroup pointer or NULL will be returned. 
- */ - return folio_memcg_check(folio); + off = obj_to_index(slab->slab_cache, slab, p); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); + + return NULL; } /* @@ -2602,10 +2589,15 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) */ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { + struct slab *slab; + if (mem_cgroup_disabled()) return NULL; - return mem_cgroup_from_obj_folio(virt_to_folio(p), p); + slab = virt_to_slab(p); + if (slab) + return mem_cgroup_from_obj_slab(slab, p); + return folio_memcg_check(virt_to_folio(p)); } static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) From bbe711730515f688a0bf4ab76a2639bcede933f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:29 +0000 Subject: [PATCH 33/42] kasan: Remove references to folio in __kasan_mempool_poison_object() In preparation for splitting struct slab from struct page and struct folio, remove mentions of struct folio from this function. There is a mild improvement for large kmalloc objects as we will avoid calling compound_head() for them. We can discard the comment as using PageLargeKmalloc() rather than !folio_test_slab() makes it obvious. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: kasan-dev Link: https://patch.msgid.link/20251113000932.1589073-16-willy@infradead.org Acked-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/kasan/common.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..38e8bb0bf326 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -520,24 +520,20 @@ void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { - struct folio *folio = virt_to_folio(ptr); + struct page *page = virt_to_page(ptr); struct slab *slab; - /* - * This function can be called for large kmalloc allocation that get - * their memory from page_alloc. Thus, the folio might not be a slab. - */ - if (unlikely(!folio_test_slab(folio))) { + if (unlikely(PageLargeKmalloc(page))) { if (check_page_allocation(ptr, ip)) return false; - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); return true; } if (is_kfence_address(ptr)) return true; - slab = folio_slab(folio); + slab = page_slab(page); if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; From 76ade2443397ef7612c978f92858d525e5b2eeab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:30 +0000 Subject: [PATCH 34/42] slab: Remove references to folios from virt_to_slab() Use page_slab() instead of virt_to_folio() which will work perfectly when struct slab is separated from struct folio. This was the last user of folio_slab(), so delete it. 
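A small sketch of what the simplified helper guarantees, assuming kernel context; same_slab() is a hypothetical function used only for illustration:

/*
 * page_slab() resolves tail pages via compound_head and checks the page type
 * itself, so virt_to_slab() returns the owning slab for any address inside a
 * slab's pages and NULL for everything else.  vmalloc addresses must still be
 * filtered out by the caller first, as kfree_rcu_sheaf() does above.
 */
static bool same_slab(const void *a, const void *b)
{
	struct slab *sa = virt_to_slab(a);

	return sa && sa == virt_to_slab(b);
}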
Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-17-willy@infradead.org Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 31ccf0f6d3a1..6e3e80c90043 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -117,19 +117,6 @@ static_assert(sizeof(struct slab) <= sizeof(struct page)); static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif -/** - * folio_slab - Converts from folio to slab. - * @folio: The folio. - * - * Currently struct slab is a different representation of a folio where - * folio_test_slab() is true. - * - * Return: The slab which contains this folio. - */ -#define folio_slab(folio) (_Generic((folio), \ - const struct folio *: (const struct slab *)(folio), \ - struct folio *: (struct slab *)(folio))) - /** * slab_folio - The folio allocated for a slab * @s: The slab. @@ -192,12 +179,7 @@ static inline pg_data_t *slab_pgdat(const struct slab *slab) static inline struct slab *virt_to_slab(const void *addr) { - struct folio *folio = virt_to_folio(addr); - - if (!folio_test_slab(folio)) - return NULL; - - return folio_slab(folio); + return page_slab(virt_to_page(addr)); } static inline int slab_order(const struct slab *slab) From 1742d97df628de55c0df1a0eb6eefb27136ee890 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:47 +0100 Subject: [PATCH 35/42] mempool: factor out a mempool_alloc_from_pool helper Add a helper for the mempool_alloc slowpath to better separate it from the fast path, and also use it to implement mempool_alloc_preallocated which shares the same logic. [hughd@google.com: fix lack of retrying with __GFP_DIRECT_RECLAIM] [vbabka@suse.cz: really use limited flags for first mempool attempt] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-7-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 126 +++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index a0718a35c34f..6bcc319d547d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -380,6 +380,50 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); +static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) +{ + unsigned long flags; + void *element; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(!pool->curr_nr)) + goto fail; + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + + /* Paired with rmb in mempool_free(), read comment there. */ + smp_wmb(); + + /* + * Update the allocation stack trace as this is more useful for + * debugging. + */ + kmemleak_update_trace(element); + return element; + +fail: + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + DEFINE_WAIT(wait); + + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * Wait for someone else to return an element to @pool. + * + * FIXME: this should be io_schedule(). The timeout is there as + * a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5 * HZ); + finish_wait(&pool->wait, &wait); + } else { + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. 
*/ + spin_unlock_irqrestore(&pool->lock, flags); + } + + return NULL; +} + /* * Adjust the gfp flags for mempool allocations, as we never want to dip into * the global emergency reserves or retry in the page allocator. @@ -413,8 +457,6 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; - unsigned long flags; - wait_queue_entry_t wait; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); @@ -428,53 +470,27 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) element = pool->alloc(gfp_temp, pool->pool_data); } - if (likely(element)) - return element; - - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + if (unlikely(!element)) { /* - * Update the allocation stack trace as this is more useful - * for debugging. + * Try to allocate an element from the pool. + * + * The first pass won't have __GFP_DIRECT_RECLAIM and won't + * sleep in mempool_alloc_from_pool. Retry the allocation + * with all flags set in that case. */ - kmemleak_update_trace(element); - return element; + element = mempool_alloc_from_pool(pool, gfp_temp); + if (!element) { + if (gfp_temp != gfp_mask) { + gfp_temp = gfp_mask; + goto repeat_alloc; + } + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + goto repeat_alloc; + } + } } - /* - * We use gfp mask w/o direct reclaim or IO for the first round. If - * alloc failed with that and @pool was empty, retry immediately. - */ - if (gfp_temp != gfp_mask) { - spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask; - goto repeat_alloc; - } - - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; - } - - /* Let's wait for someone else to return an element to @pool */ - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - - spin_unlock_irqrestore(&pool->lock, flags); - - /* - * FIXME: this should be io_schedule(). The timeout is there as a - * workaround for some DM problems in 2.6.18. - */ - io_schedule_timeout(5*HZ); - - finish_wait(&pool->wait, &wait); - goto repeat_alloc; + return element; } EXPORT_SYMBOL(mempool_alloc_noprof); @@ -492,25 +508,7 @@ EXPORT_SYMBOL(mempool_alloc_noprof); */ void *mempool_alloc_preallocated(mempool_t *pool) { - void *element; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); - /* - * Update the allocation stack trace as this is more useful - * for debugging. - */ - kmemleak_update_trace(element); - return element; - } - spin_unlock_irqrestore(&pool->lock, flags); - - return NULL; + return mempool_alloc_from_pool(pool, GFP_NOWAIT); } EXPORT_SYMBOL(mempool_alloc_preallocated); From ac529d86ad26d632d3c70b7c5b839282a3294d2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:48 +0100 Subject: [PATCH 36/42] mempool: add mempool_{alloc,free}_bulk Add a version of the mempool allocator that works for batch allocations of multiple objects. Calling mempool_alloc in a loop is not safe because it could deadlock if multiple threads are performing such an allocation at the same time. 
As an extra benefit the interface is build so that the same array can be used for alloc_pages_bulk / release_pages so that at least for page backed mempools the fast path can use a nice batch optimization. Note that mempool_alloc_bulk does not take a gfp_mask argument as it must always be able to sleep and doesn't support any non-trivial modifiers. NOFO or NOIO constrainst must be set through the scoped API. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-8-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 6 ++ mm/mempool.c | 177 ++++++++++++++++++++++++++++++---------- 2 files changed, 141 insertions(+), 42 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 34941a4b9026..e914fec0e119 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -66,9 +66,15 @@ extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, + unsigned int count, unsigned int allocated); +#define mempool_alloc_bulk(...) \ + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); +unsigned int mempool_free_bulk(struct mempool *pool, void **elem, + unsigned int count); /* * A mempool_alloc_t and mempool_free_t that get the memory from diff --git a/mm/mempool.c b/mm/mempool.c index 6bcc319d547d..b45bcf415147 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -21,11 +21,21 @@ #include "slab.h" static DECLARE_FAULT_ATTR(fail_mempool_alloc); +static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); static int __init mempool_faul_inject_init(void) { - return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + int error; + + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", NULL, &fail_mempool_alloc)); + if (error) + return error; + + /* booting will fail on error return here, don't bother to cleanup */ + return PTR_ERR_OR_ZERO( + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, + &fail_mempool_alloc_bulk)); } late_initcall(mempool_faul_inject_init); @@ -380,15 +390,22 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); -static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) +static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated, + gfp_t gfp_mask) { unsigned long flags; - void *element; + unsigned int i; spin_lock_irqsave(&pool->lock, flags); - if (unlikely(!pool->curr_nr)) + if (unlikely(pool->curr_nr < count - allocated)) goto fail; - element = remove_element(pool); + for (i = 0; i < count; i++) { + if (!elems[i]) { + elems[i] = remove_element(pool); + allocated++; + } + } spin_unlock_irqrestore(&pool->lock, flags); /* Paired with rmb in mempool_free(), read comment there. */ @@ -398,8 +415,9 @@ static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) * Update the allocation stack trace as this is more useful for * debugging. 
*/ - kmemleak_update_trace(element); - return element; + for (i = 0; i < count; i++) + kmemleak_update_trace(elems[i]); + return allocated; fail: if (gfp_mask & __GFP_DIRECT_RECLAIM) { @@ -421,7 +439,7 @@ static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); } - return NULL; + return allocated; } /* @@ -437,6 +455,65 @@ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); } +/** + * mempool_alloc_bulk - allocate multiple elements from a memory pool + * @pool: pointer to the memory pool + * @elems: partially or fully populated elements array + * @count: number of entries in @elem that need to be allocated + * @allocated: number of entries in @elem already allocated + * + * Allocate elements for each slot in @elem that is non-%NULL. This is done by + * first calling into the alloc_fn supplied at pool initialization time, and + * dipping into the reserved pool when alloc_fn fails to allocate an element. + * + * On return all @count elements in @elems will be populated. + * + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. + */ +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated) +{ + gfp_t gfp_mask = GFP_KERNEL; + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); + unsigned int i = 0; + + VM_WARN_ON_ONCE(count > pool->min_nr); + might_alloc(gfp_mask); + + /* + * If an error is injected, fail all elements in a bulk allocation so + * that we stress the multiple elements missing path. + */ + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + goto use_pool; + } + +repeat_alloc: + /* + * Try to allocate the elements using the allocation callback first as + * that might succeed even when the caller's bulk allocation did not. + */ + for (i = 0; i < count; i++) { + if (elems[i]) + continue; + elems[i] = pool->alloc(gfp_temp, pool->pool_data); + if (unlikely(!elems[i])) + goto use_pool; + allocated++; + } + + return 0; + +use_pool: + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, + gfp_temp); + gfp_temp = gfp_mask; + goto repeat_alloc; +} +EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); + /** * mempool_alloc - allocate an element from a memory pool * @pool: pointer to the memory pool @@ -478,8 +555,7 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) * sleep in mempool_alloc_from_pool. Retry the allocation * with all flags set in that case. */ - element = mempool_alloc_from_pool(pool, gfp_temp); - if (!element) { + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { if (gfp_temp != gfp_mask) { gfp_temp = gfp_mask; goto repeat_alloc; @@ -508,26 +584,33 @@ EXPORT_SYMBOL(mempool_alloc_noprof); */ void *mempool_alloc_preallocated(mempool_t *pool) { - return mempool_alloc_from_pool(pool, GFP_NOWAIT); + void *element = NULL; + + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); + return element; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to a mempool - * @element: pointer to element + * mempool_free_bulk - return elements to a mempool * @pool: pointer to the memory pool + * @elems: elements to return + * @count: number of elements to return * - * Returns @element to @pool if it needs replenishing, else frees it using - * the free_fn callback in @pool. 
+ * Returns a number of elements from the start of @elem to @pool if @pool needs + * replenishing and sets their slots in @elem to NULL. Other elements are left + * in @elem. * - * This function only sleeps if the free_fn callback sleeps. + * Return: number of elements transferred to @pool. Elements are always + * transferred from the beginning of @elem, so the return value can be used as + * an offset into @elem for the freeing the remaining elements in the caller. */ -void mempool_free(void *element, mempool_t *pool) +unsigned int mempool_free_bulk(struct mempool *pool, void **elems, + unsigned int count) { unsigned long flags; - - if (unlikely(element == NULL)) - return; + unsigned int freed = 0; + bool added = false; /* * Paired with the wmb in mempool_alloc(). The preceding read is @@ -561,21 +644,6 @@ void mempool_free(void *element, mempool_t *pool) * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. - */ - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr < pool->min_nr)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; - } - spin_unlock_irqrestore(&pool->lock, flags); - } - - /* - * Handle the min_nr = 0 edge case: * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the @@ -583,20 +651,45 @@ void mempool_free(void *element, mempool_t *pool) * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. */ - if (unlikely(pool->min_nr == 0 && + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { + spin_lock_irqsave(&pool->lock, flags); + while (pool->curr_nr < pool->min_nr && freed < count) { + add_element(pool, elems[freed++]); + added = true; + } + spin_unlock_irqrestore(&pool->lock, flags); + } else if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { + /* Handle the min_nr = 0 edge case: */ spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; + add_element(pool, elems[freed++]); + added = true; } spin_unlock_irqrestore(&pool->lock, flags); } - pool->free(element, pool->pool_data); + if (unlikely(added) && wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + + return freed; +} +EXPORT_SYMBOL_GPL(mempool_free_bulk); + +/** + * mempool_free - return an element to the pool. + * @element: element to return + * @pool: pointer to the memory pool + * + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. + */ +void mempool_free(void *element, struct mempool *pool) +{ + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); From 9c4391767f31d4114da577ab87437f28c1171d6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:49 +0100 Subject: [PATCH 37/42] mempool: legitimize the io_schedule_timeout in mempool_alloc_from_pool The timeout here is and old workaround with a Fixme comment. But thinking about it, it makes sense to keep it, so reword the comment. 
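For the bulk interface added a few patches above, a usage sketch under stated assumptions: my_pool is a kmalloc-backed mempool created elsewhere, NR_ELEMS does not exceed the pool's min_nr, and the call site may sleep (mempool_alloc_bulk() takes no gfp argument; NOFS/NOIO constraints come from the scoped API):

#define NR_ELEMS 4	/* illustrative; must not exceed the pool's min_nr */

static void bulk_roundtrip(struct mempool *my_pool)
{
	void *elems[NR_ELEMS] = { };	/* NULL slots are the ones to be filled */
	unsigned int kept, i;

	/* Fills every NULL slot; returns 0 and may sleep until it succeeds. */
	mempool_alloc_bulk(my_pool, elems, NR_ELEMS, 0);

	/* ... use the elements ... */

	/*
	 * The pool takes back as many elements as it needs, always from the
	 * front of the array; the caller frees the remainder itself.
	 */
	kept = mempool_free_bulk(my_pool, elems, NR_ELEMS);
	for (i = kept; i < NR_ELEMS; i++)
		kfree(elems[i]);	/* matches the kmalloc-backed pool assumed here */
}

The array layout is the same one alloc_pages_bulk()/release_pages() operate on, which is what lets page-backed pools use the batched page allocator on the fast path, as the commit message notes.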
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-9-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index b45bcf415147..9ec3a04a0130 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -427,10 +427,10 @@ static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, spin_unlock_irqrestore(&pool->lock, flags); /* - * Wait for someone else to return an element to @pool. - * - * FIXME: this should be io_schedule(). The timeout is there as - * a workaround for some DM problems in 2.6.18. + * Wait for someone else to return an element to @pool, but wake + * up occasionally as memory pressure might have reduced even + * and the normal allocation in alloc_fn could succeed even if + * no element was returned. */ io_schedule_timeout(5 * HZ); finish_wait(&pool->wait, &wait); From 8b41fb80a2cc023591f47d63b094e96af9c2c615 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:50 +0100 Subject: [PATCH 38/42] mempool: remove mempool_{init,create}_kvmalloc_pool This was added for bcachefs and is unused now. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-10-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 13 ------------- mm/mempool.c | 13 ------------- 2 files changed, 26 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index e914fec0e119..d9332485e8ca 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -103,19 +103,6 @@ void mempool_kfree(void *element, void *pool_data); mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ (void *)(unsigned long)(_size)) -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); -void mempool_kvfree(void *element, void *pool_data); - -static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - -static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/mm/mempool.c b/mm/mempool.c index 9ec3a04a0130..0e1e015998e7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -728,19 +728,6 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kvmalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kvmalloc); - -void mempool_kvfree(void *element, void *pool_data) -{ - kvfree(element); -} -EXPORT_SYMBOL(mempool_kvfree); - /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. From 0cab6873b7305abdd0acd95ee8cfa56b983500da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:51 +0100 Subject: [PATCH 39/42] mempool: de-typedef Switch all uses of the deprecated mempool_t typedef in the core mempool code to use struct mempool instead. 
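For illustration of the preferred spelling after this conversion (sketch only; struct my_device, my_dev_init() and my_dev_exit() are hypothetical, and the element count and size are arbitrary):

#include <linux/mempool.h>

struct my_device {
        struct mempool  rq_pool;        /* was: mempool_t rq_pool; */
};

static int my_dev_init(struct my_device *dev)
{
        /* keep 16 elements of 128 bytes in reserve, kmalloc/kfree backed */
        return mempool_init_kmalloc_pool(&dev->rq_pool, 16, 128);
}

static void my_dev_exit(struct my_device *dev)
{
        mempool_exit(&dev->rq_pool);
}

The mempool_t typedef itself stays in the header for now, so existing users keep building; only the core mempool code stops spelling it that way.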
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-11-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 39 ++++++++++++++++---------------- mm/mempool.c | 50 +++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index d9332485e8ca..e8e440e04a06 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -27,32 +27,31 @@ typedef struct mempool { wait_queue_head_t wait; } mempool_t; -static inline bool mempool_initialized(mempool_t *pool) +static inline bool mempool_initialized(struct mempool *pool) { return pool->elements != NULL; } -static inline bool mempool_is_saturated(mempool_t *pool) +static inline bool mempool_is_saturated(struct mempool *pool) { return READ_ONCE(pool->curr_nr) >= pool->min_nr; } -void mempool_exit(mempool_t *pool); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); - -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); +void mempool_exit(struct mempool *pool); +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id); +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data); #define mempool_init(...) \ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) -extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); - -extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); +struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int nid); #define mempool_create_node(...) \ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) @@ -60,10 +59,10 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ GFP_KERNEL, NUMA_NO_NODE) -extern int mempool_resize(mempool_t *pool, int new_min_nr); -extern void mempool_destroy(mempool_t *pool); +int mempool_resize(struct mempool *pool, int new_min_nr); +void mempool_destroy(struct mempool *pool); -extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, @@ -71,8 +70,8 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, #define mempool_alloc_bulk(...) 
\ alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) -extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; -extern void mempool_free(void *element, mempool_t *pool); +void *mempool_alloc_preallocated(struct mempool *pool) __malloc; +void mempool_free(void *element, struct mempool *pool); unsigned int mempool_free_bulk(struct mempool *pool, void **elem, unsigned int count); diff --git a/mm/mempool.c b/mm/mempool.c index 0e1e015998e7..89ab7bba5c9c 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -40,7 +40,7 @@ static int __init mempool_faul_inject_init(void) late_initcall(mempool_faul_inject_init); #ifdef CONFIG_SLUB_DEBUG_ON -static void poison_error(mempool_t *pool, void *element, size_t size, +static void poison_error(struct mempool *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; @@ -57,7 +57,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size, dump_stack(); } -static void __check_element(mempool_t *pool, void *element, size_t size) +static void __check_element(struct mempool *pool, void *element, size_t size) { u8 *obj = element; size_t i; @@ -73,7 +73,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size) memset(obj, POISON_INUSE, size); } -static void check_element(mempool_t *pool, void *element) +static void check_element(struct mempool *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -102,7 +102,7 @@ static void __poison_element(void *element, size_t size) obj[size - 1] = POISON_END; } -static void poison_element(mempool_t *pool, void *element) +static void poison_element(struct mempool *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -123,15 +123,16 @@ static void poison_element(mempool_t *pool, void *element) } } #else /* CONFIG_SLUB_DEBUG_ON */ -static inline void check_element(mempool_t *pool, void *element) +static inline void check_element(struct mempool *pool, void *element) { } -static inline void poison_element(mempool_t *pool, void *element) +static inline void poison_element(struct mempool *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ -static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) +static __always_inline bool kasan_poison_element(struct mempool *pool, + void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); @@ -141,7 +142,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) return true; } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); @@ -153,7 +154,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) (unsigned long)pool->pool_data); } -static __always_inline void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(struct mempool *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); @@ -161,7 +162,7 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(struct mempool *pool) { void *element = pool->elements[--pool->curr_nr]; @@ -182,7 +183,7 @@ static 
void *remove_element(mempool_t *pool) * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ -void mempool_exit(mempool_t *pool) +void mempool_exit(struct mempool *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -201,7 +202,7 @@ EXPORT_SYMBOL(mempool_exit); * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ -void mempool_destroy(mempool_t *pool) +void mempool_destroy(struct mempool *pool) { if (unlikely(!pool)) return; @@ -211,9 +212,9 @@ void mempool_destroy(mempool_t *pool) } EXPORT_SYMBOL(mempool_destroy); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; @@ -263,8 +264,9 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. */ -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); @@ -290,11 +292,11 @@ EXPORT_SYMBOL(mempool_init_noprof); * * Return: pointer to the created memory pool object or %NULL on error. */ -mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { - mempool_t *pool; + struct mempool *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) @@ -328,7 +330,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof); * * Return: %0 on success, negative error code otherwise. */ -int mempool_resize(mempool_t *pool, int new_min_nr) +int mempool_resize(struct mempool *pool, int new_min_nr) { void *element; void **new_elements; @@ -530,7 +532,7 @@ EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); * an element. Allocation failure can only happen when @gfp_mask does not * include %__GFP_DIRECT_RECLAIM. */ -void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) { gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; @@ -582,7 +584,7 @@ EXPORT_SYMBOL(mempool_alloc_noprof); * Return: pointer to the allocated element or %NULL if no elements are * available. */ -void *mempool_alloc_preallocated(mempool_t *pool) +void *mempool_alloc_preallocated(struct mempool *pool) { void *element = NULL; From 07723a41eee9525a90d027f7ca49d33fcd47e775 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:52 +0100 Subject: [PATCH 40/42] mempool: drop the file name in the top of file comment Mentioning the name of the file is redundant, so drop it. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-12-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 89ab7bba5c9c..efb383a94a28 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/mempool.c - * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. From 48233291461b0539d798d00aaacccf1b3b163102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Oct 2025 14:17:23 +0200 Subject: [PATCH 41/42] mempool: clarify behavior of mempool_alloc_preallocated() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation of that function promises to never sleep. However on PREEMPT_RT a spinlock_t might in fact sleep. Reword the documentation so users can predict its behavior better. mempool could also replace spinlock_t with raw_spinlock_t which doesn't sleep even on PREEMPT_RT but that would take away the improved preemptibility of sleeping locks. Link: https://lkml.kernel.org/r/20251014-mempool-doc-v1-1-bc9ebf169700@linutronix.de Signed-off-by: Thomas Weißschuh Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: "Vishal Moola (Oracle)" Signed-off-by: Andrew Morton Signed-off-by: Vlastimil Babka --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index efb383a94a28..bb596cac57ff 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -576,8 +576,8 @@ EXPORT_SYMBOL(mempool_alloc_noprof); * @pool: pointer to the memory pool * * This function is similar to mempool_alloc(), but it only attempts allocating - * an element from the preallocated elements. It does not sleep and immediately - * returns if no preallocated elements are available. + * an element from the preallocated elements. It only takes a single spinlock_t + * and immediately returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available. From b55590558ff7c66c4a494af8ea08999c27594bc8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Nov 2025 14:23:27 +0000 Subject: [PATCH 42/42] slab: Remove unnecessary call to compound_head() in alloc_from_pcs() Each page knows which node it belongs to, so there's no need to convert to a folio. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20251124142329.1691780-1-willy@infradead.org Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 8e5da7b6efe4..25cc3d59337b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5149,7 +5149,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * be false because of cpu migration during an unlocked part of * the current allocation or previous freeing process. */ - if (folio_nid(virt_to_folio(object)) != node) { + if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); return NULL; }
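A minimal sketch of the reasoning behind this last change (illustrative helper, not something the patch adds): each struct page carries its node id, so the node check does not need the folio/head-page resolution.

#include <linux/mm.h>

/* Illustrative only: equivalent to folio_nid(virt_to_folio(object)) for a
 * slab object, but without the compound_head() walk. */
static inline int slab_object_nid(const void *object)
{
        return page_to_nid(virt_to_page(object));
}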