Merge branch 'fix-late-dma-unmap-crash-for-page-pool'

Toke Høiland-Jørgensen says:

====================
Fix late DMA unmap crash for page pool

This series fixes the late dma_unmap crash for page pool first reported
by Yonglong Liu in [0]. It is an alternative approach to the one
submitted by Yunsheng Lin, most recently in [1]. The first commit just
wraps some tests in a helper function, in preparation for the main change
in patch 2. See the commit message of patch 2 for the details.

[0] https://lore.kernel.org/8067f204-1380-4d37-8ffd-007fc6f26738@kernel.org
[1] https://lore.kernel.org/20250307092356.638242-1-linyunsheng@huawei.com

v8: https://lore.kernel.org/20250407-page-pool-track-dma-v8-0-da9500d4ba21@redhat.com
v7: https://lore.kernel.org/20250404-page-pool-track-dma-v7-0-ad34f069bc18@redhat.com
v6: https://lore.kernel.org/20250401-page-pool-track-dma-v6-0-8b83474870d4@redhat.com
v5: https://lore.kernel.org/20250328-page-pool-track-dma-v5-0-55002af683ad@redhat.com
v4: https://lore.kernel.org/20250327-page-pool-track-dma-v4-0-b380dc6706d0@redhat.com
v3: https://lore.kernel.org/20250326-page-pool-track-dma-v3-0-8e464016e0ac@redhat.com
v2: https://lore.kernel.org/20250325-page-pool-track-dma-v2-0-113ebc1946f3@redhat.com
v1: https://lore.kernel.org/20250314-page-pool-track-dma-v1-0-c212e57a74c2@redhat.com
====================

Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-0-6a9ef2e0cba8@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
9 changed files with 176 additions and 38 deletions


@@ -707,8 +707,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
page = xdpi.page.page;
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
* as we know this is a page_pool page.
/* No need to check page_pool_page_is_pp() as we
* know this is a page_pool page.
*/
page_pool_recycle_direct(page->pp, page);
} while (++n < num);


@@ -4248,4 +4248,62 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
#define VM_SEALED_SYSMAP VM_NONE
#endif
/*
* DMA mapping IDs for page_pool
*
* When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
* stashes it in the upper bits of page->pp_magic. We always want to be able to
* unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
* pages can have arbitrary kernel pointers stored in the same field as pp_magic
* (since it overlaps with page->lru.next), so we must ensure that we cannot
* mistake a valid kernel pointer for any of the values we write into this
* field.
*
* On architectures that set POISON_POINTER_DELTA, this is already ensured,
* since this value becomes part of PP_SIGNATURE, meaning we can just use the
* space between the PP_SIGNATURE value (without POISON_POINTER_DELTA) and the
* lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
* 0, we make sure that we leave the two topmost bits empty, as that guarantees
* we won't mistake a valid kernel pointer for a value we set, regardless of the
* VMSPLIT setting.
*
* Altogether, this means that the number of bits available is constrained by
* the size of an unsigned long (at the upper end, subtracting two bits per the
* above), and the definition of PP_SIGNATURE (with or without
* POISON_POINTER_DELTA).
*/
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
#if POISON_POINTER_DELTA > 0
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
* index to not overlap with that if set
*/
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
#else
/* Always leave out the topmost two; see above. */
#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2)
#endif
#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
PP_DMA_INDEX_SHIFT)
/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
* OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
* the head page of a compound page and bit 1 for pfmemalloc pages, as well as the
* bits used for the DMA index. page_is_pfmemalloc() is checked in
* __page_pool_put_page() to avoid recycling the pfmemalloc page.
*/
#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
#ifdef CONFIG_PAGE_POOL
static inline bool page_pool_page_is_pp(struct page *page)
{
return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
}
#else
static inline bool page_pool_page_is_pp(struct page *page)
{
return false;
}
#endif
#endif /* _LINUX_MM_H */
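
To make the bit layout above concrete, here is a small illustrative sketch (not
part of the patch; the helper names are invented for this example, while the
real accessors are the netmem_get_dma_index()/netmem_set_dma_index() helpers
added in net/core/netmem_priv.h further down) of how an ID is stashed in and
recovered from page->pp_magic using the masks defined above:

static inline unsigned long pp_magic_with_dma_index(unsigned long pp_magic,
						    unsigned long id)
{
	/* id must fit in PP_DMA_INDEX_BITS; bits below PP_DMA_INDEX_SHIFT
	 * (PP_SIGNATURE plus bits 0 and 1) are left untouched.
	 */
	return pp_magic | (id << PP_DMA_INDEX_SHIFT);
}

static inline unsigned long pp_magic_dma_index(unsigned long pp_magic)
{
	return (pp_magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
}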


@@ -70,6 +70,10 @@
#define KEY_DESTROY 0xbd
/********** net/core/page_pool.c **********/
/*
* page_pool uses additional free bits within this value to store data; see the
* definition of PP_DMA_INDEX_MASK in mm.h
*/
#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA)
/********** net/core/skbuff.c **********/


@@ -6,6 +6,7 @@
#include <linux/dma-direction.h>
#include <linux/ptr_ring.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <net/netmem.h>
#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA
@@ -33,6 +34,9 @@
#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \
PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM)
/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */
#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
/*
* Fast allocation side cache array/stack
*
@@ -221,6 +225,8 @@ struct page_pool {
void *mp_priv;
const struct memory_provider_ops *mp_ops;
struct xarray dma_mapped;
#ifdef CONFIG_PAGE_POOL_STATS
/* recycle stats are per-cpu to avoid locking */
struct page_pool_recycle_stats __percpu *recycle_stats;
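
For context, PP_DMA_INDEX_LIMIT deliberately starts at 1 so that a stored ID of
0 can mean "no DMA index recorded" on the netmem side (the release path later
in this series bails out when the index is 0). A minimal standalone sketch of
the allocation pattern, assuming GFP_KERNEL context and a hypothetical "page"
pointer (the real pool code picks the gfp flags and the xa_alloc_bh() variant
depending on context):

	struct xarray dma_mapped;
	u32 id;

	xa_init_flags(&dma_mapped, XA_FLAGS_ALLOC1);
	/* xa_alloc() writes the chosen slot into id; with this limit it is >= 1 */
	if (xa_alloc(&dma_mapped, &id, page, PP_DMA_INDEX_LIMIT, GFP_KERNEL))
		return false;	/* -EBUSY (no free slot) or -ENOMEM */
	/* ... then stash id in pp_magic via netmem_set_dma_index() ... */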


@@ -897,9 +897,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
#ifdef CONFIG_PAGE_POOL
((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
#endif
page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -926,10 +924,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
#ifdef CONFIG_PAGE_POOL
if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
#endif
return bad_reason;
}


@@ -5,7 +5,7 @@
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
{
return __netmem_clear_lsb(netmem)->pp_magic;
return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
@@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
__netmem_clear_lsb(netmem)->pp_magic = 0;
}
static inline bool netmem_is_pp(netmem_ref netmem)
{
return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
}
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
{
__netmem_clear_lsb(netmem)->pp = pool;
@@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
{
__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
}
static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
{
unsigned long magic;
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
return 0;
magic = __netmem_clear_lsb(netmem)->pp_magic;
return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
}
static inline void netmem_set_dma_index(netmem_ref netmem,
unsigned long id)
{
unsigned long magic;
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
return;
magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
__netmem_clear_lsb(netmem)->pp_magic = magic;
}
#endif


@@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool,
/* Drivers calling page_pool_create() also call page_pool_destroy() */
refcount_set(&pool->user_cnt, 1);
if (pool->dma_map)
get_device(pool->p.dev);
xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
netdev_assert_locked(pool->slow.netdev);
@@ -320,9 +319,7 @@ static int page_pool_init(struct page_pool *pool,
static void page_pool_uninit(struct page_pool *pool)
{
ptr_ring_cleanup(&pool->ring, NULL);
if (pool->dma_map)
put_device(pool->p.dev);
xa_destroy(&pool->dma_mapped);
#ifdef CONFIG_PAGE_POOL_STATS
if (!pool->system)
@@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
netmem_ref netmem,
u32 dma_sync_size)
{
if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
rcu_read_lock();
/* re-check under rcu_read_lock() to sync with page_pool_scrub() */
if (pool->dma_sync)
__page_pool_dma_sync_for_device(pool, netmem,
dma_sync_size);
rcu_read_unlock();
}
}
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
dma_addr_t dma;
int err;
u32 id;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
@@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
if (dma_mapping_error(pool->p.dev, dma))
return false;
if (page_pool_set_dma_addr_netmem(netmem, dma))
if (page_pool_set_dma_addr_netmem(netmem, dma)) {
WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
goto unmap_failed;
}
if (in_softirq())
err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
PP_DMA_INDEX_LIMIT, gfp);
else
err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
PP_DMA_INDEX_LIMIT, gfp);
if (err) {
WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
goto unset_failed;
}
netmem_set_dma_index(netmem, id);
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
unset_failed:
page_pool_set_dma_addr_netmem(netmem, 0);
unmap_failed:
WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
@@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
put_page(page);
return NULL;
}
@@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
*/
for (i = 0; i < nr_pages; i++) {
netmem = pool->alloc.cache[i];
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
put_page(netmem_to_page(netmem));
continue;
}
@@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem)
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
netmem_ref netmem)
{
struct page *old, *page = netmem_to_page(netmem);
unsigned long id;
dma_addr_t dma;
if (!pool->dma_map)
@@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
*/
return;
id = netmem_get_dma_index(netmem);
if (!id)
return;
if (in_softirq())
old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
else
old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
if (old != page)
return;
dma = page_pool_get_dma_addr_netmem(netmem);
/* When page is unmapped, it cannot be returned to our pool */
@@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr_netmem(netmem, 0);
netmem_set_dma_index(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -1080,8 +1114,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
static void page_pool_scrub(struct page_pool *pool)
{
unsigned long id;
void *ptr;
page_pool_empty_alloc_cache_once(pool);
pool->destroy_cnt++;
if (!pool->destroy_cnt++ && pool->dma_map) {
if (pool->dma_sync) {
/* Disable page_pool_dma_sync_for_device() */
pool->dma_sync = false;
/* Make sure all concurrent returns that may see the old
* value of dma_sync (and thus perform a sync) have
* finished before doing the unmapping below. Skip the
* wait if the device doesn't actually need syncing, or
* if there are no outstanding mapped pages.
*/
if (dma_dev_need_sync(pool->p.dev) &&
!xa_empty(&pool->dma_mapped))
synchronize_net();
}
xa_for_each(&pool->dma_mapped, id, ptr)
__page_pool_release_page_dma(pool, page_to_netmem(ptr));
}
/* No more consumers should exist, but producers could still
* be in-flight.
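
The dma_sync handling in page_pool_scrub() above pairs with the
rcu_read_lock()/rcu_read_unlock() added to page_pool_dma_sync_for_device():
once dma_sync is cleared and synchronize_net() (an RCU grace period) has
returned, no concurrent return path can still be syncing a page that the scrub
loop is about to unmap, and the xa_cmpxchg() in __page_pool_release_page_dma()
ensures only one of the two paths actually performs the unmap. A minimal
sketch of the generic pattern, with hypothetical names (not the pool code):

	/* hot path (reader) */
	rcu_read_lock();
	if (READ_ONCE(enabled))
		sync_for_device(res);
	rcu_read_unlock();

	/* teardown (writer) */
	WRITE_ONCE(enabled, false);
	synchronize_rcu();	/* wait out readers that saw enabled == true */
	unmap_all(res);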


@@ -893,11 +893,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
static bool is_pp_netmem(netmem_ref netmem)
{
return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
}
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -995,14 +990,7 @@ bool napi_pp_put_page(netmem_ref netmem)
{
netmem = netmem_compound_head(netmem);
/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
* in order to preserve any existing bits, such as bit 0 for the
* head page of compound page and bit 1 for pfmemalloc page, so
* mask those bits for freeing side when doing below checking,
* and page_is_pfmemalloc() is checked in __page_pool_put_page()
* to avoid recycling the pfmemalloc page.
*/
if (unlikely(!is_pp_netmem(netmem)))
if (unlikely(!netmem_is_pp(netmem)))
return false;
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
@@ -1042,7 +1030,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
for (i = 0; i < shinfo->nr_frags; i++) {
head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
if (likely(is_pp_netmem(head_netmem)))
if (likely(netmem_is_pp(head_netmem)))
page_pool_ref_netmem(head_netmem);
else
page_ref_inc(netmem_to_page(head_netmem));


@@ -438,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
* as mem->type knows this is a page_pool page
/* No need to check netmem_is_pp() as mem->type knows this is a
* page_pool page
*/
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
napi_direct);