mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-03 21:45:08 -04:00
Merge branch 'fix-late-dma-unmap-crash-for-page-pool'
Toke Høiland-Jørgensen says: ==================== Fix late DMA unmap crash for page pool This series fixes the late dma_unmap crash for page pool first reported by Yonglong Liu in [0]. It is an alternative approach to the one submitted by Yunsheng Lin, most recently in [1]. The first commit just wraps some tests in a helper function, in preparation of the main change in patch 2. See the commit message of patch 2 for the details. [0] https://lore.kernel.org/8067f204-1380-4d37-8ffd-007fc6f26738@kernel.org [1] https://lore.kernel.org/20250307092356.638242-1-linyunsheng@huawei.com v8: https://lore.kernel.org/20250407-page-pool-track-dma-v8-0-da9500d4ba21@redhat.com v7: https://lore.kernel.org/20250404-page-pool-track-dma-v7-0-ad34f069bc18@redhat.com v6: https://lore.kernel.org/20250401-page-pool-track-dma-v6-0-8b83474870d4@redhat.com v5: https://lore.kernel.org/20250328-page-pool-track-dma-v5-0-55002af683ad@redhat.com v4: https://lore.kernel.org/20250327-page-pool-track-dma-v4-0-b380dc6706d0@redhat.com v3: https://lore.kernel.org/20250326-page-pool-track-dma-v3-0-8e464016e0ac@redhat.com v2: https://lore.kernel.org/20250325-page-pool-track-dma-v2-0-113ebc1946f3@redhat.com v1: https://lore.kernel.org/20250314-page-pool-track-dma-v1-0-c212e57a74c2@redhat.com ==================== Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-0-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -707,8 +707,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
|
||||
xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
|
||||
page = xdpi.page.page;
|
||||
|
||||
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
|
||||
* as we know this is a page_pool page.
|
||||
/* No need to check page_pool_page_is_pp() as we
|
||||
* know this is a page_pool page.
|
||||
*/
|
||||
page_pool_recycle_direct(page->pp, page);
|
||||
} while (++n < num);
|
||||
|
||||
@@ -4248,4 +4248,62 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
|
||||
#define VM_SEALED_SYSMAP VM_NONE
|
||||
#endif
|
||||
|
||||
/*
|
||||
* DMA mapping IDs for page_pool
|
||||
*
|
||||
* When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
|
||||
* stashes it in the upper bits of page->pp_magic. We always want to be able to
|
||||
* unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
|
||||
* pages can have arbitrary kernel pointers stored in the same field as pp_magic
|
||||
* (since it overlaps with page->lru.next), so we must ensure that we cannot
|
||||
* mistake a valid kernel pointer with any of the values we write into this
|
||||
* field.
|
||||
*
|
||||
* On architectures that set POISON_POINTER_DELTA, this is already ensured,
|
||||
* since this value becomes part of PP_SIGNATURE; meaning we can just use the
|
||||
* space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
|
||||
* lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
|
||||
* 0, we make sure that we leave the two topmost bits empty, as that guarantees
|
||||
* we won't mistake a valid kernel pointer for a value we set, regardless of the
|
||||
* VMSPLIT setting.
|
||||
*
|
||||
* Altogether, this means that the number of bits available is constrained by
|
||||
* the size of an unsigned long (at the upper end, subtracting two bits per the
|
||||
* above), and the definition of PP_SIGNATURE (with or without
|
||||
* POISON_POINTER_DELTA).
|
||||
*/
|
||||
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
|
||||
#if POISON_POINTER_DELTA > 0
|
||||
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
|
||||
* index to not overlap with that if set
|
||||
*/
|
||||
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
|
||||
#else
|
||||
/* Always leave out the topmost two; see above. */
|
||||
#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2)
|
||||
#endif
|
||||
|
||||
#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
|
||||
PP_DMA_INDEX_SHIFT)
|
||||
|
||||
/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
|
||||
* OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
|
||||
* the head page of compound page and bit 1 for pfmemalloc page, as well as the
|
||||
* bits used for the DMA index. page_is_pfmemalloc() is checked in
|
||||
* __page_pool_put_page() to avoid recycling the pfmemalloc page.
|
||||
*/
|
||||
#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
|
||||
|
||||
#ifdef CONFIG_PAGE_POOL
|
||||
static inline bool page_pool_page_is_pp(struct page *page)
|
||||
{
|
||||
return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
|
||||
}
|
||||
#else
|
||||
static inline bool page_pool_page_is_pp(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_MM_H */
|
||||
|
||||
@@ -70,6 +70,10 @@
|
||||
#define KEY_DESTROY 0xbd
|
||||
|
||||
/********** net/core/page_pool.c **********/
|
||||
/*
|
||||
* page_pool uses additional free bits within this value to store data, see the
|
||||
* definition of PP_DMA_INDEX_MASK in mm.h
|
||||
*/
|
||||
#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA)
|
||||
|
||||
/********** net/core/skbuff.c **********/
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <linux/dma-direction.h>
|
||||
#include <linux/ptr_ring.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/xarray.h>
|
||||
#include <net/netmem.h>
|
||||
|
||||
#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA
|
||||
@@ -33,6 +34,9 @@
|
||||
#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \
|
||||
PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM)
|
||||
|
||||
/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */
|
||||
#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
|
||||
|
||||
/*
|
||||
* Fast allocation side cache array/stack
|
||||
*
|
||||
@@ -221,6 +225,8 @@ struct page_pool {
|
||||
void *mp_priv;
|
||||
const struct memory_provider_ops *mp_ops;
|
||||
|
||||
struct xarray dma_mapped;
|
||||
|
||||
#ifdef CONFIG_PAGE_POOL_STATS
|
||||
/* recycle stats are per-cpu to avoid locking */
|
||||
struct page_pool_recycle_stats __percpu *recycle_stats;
|
||||
|
||||
@@ -897,9 +897,7 @@ static inline bool page_expected_state(struct page *page,
|
||||
#ifdef CONFIG_MEMCG
|
||||
page->memcg_data |
|
||||
#endif
|
||||
#ifdef CONFIG_PAGE_POOL
|
||||
((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
|
||||
#endif
|
||||
page_pool_page_is_pp(page) |
|
||||
(page->flags & check_flags)))
|
||||
return false;
|
||||
|
||||
@@ -926,10 +924,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
|
||||
if (unlikely(page->memcg_data))
|
||||
bad_reason = "page still charged to cgroup";
|
||||
#endif
|
||||
#ifdef CONFIG_PAGE_POOL
|
||||
if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
|
||||
if (unlikely(page_pool_page_is_pp(page)))
|
||||
bad_reason = "page_pool leak";
|
||||
#endif
|
||||
return bad_reason;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
|
||||
{
|
||||
return __netmem_clear_lsb(netmem)->pp_magic;
|
||||
return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
|
||||
}
|
||||
|
||||
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
|
||||
@@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
|
||||
|
||||
static inline void netmem_clear_pp_magic(netmem_ref netmem)
|
||||
{
|
||||
WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
|
||||
|
||||
__netmem_clear_lsb(netmem)->pp_magic = 0;
|
||||
}
|
||||
|
||||
static inline bool netmem_is_pp(netmem_ref netmem)
|
||||
{
|
||||
return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
|
||||
}
|
||||
|
||||
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
|
||||
{
|
||||
__netmem_clear_lsb(netmem)->pp = pool;
|
||||
@@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
|
||||
{
|
||||
__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
|
||||
}
|
||||
|
||||
static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
|
||||
{
|
||||
unsigned long magic;
|
||||
|
||||
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
|
||||
return 0;
|
||||
|
||||
magic = __netmem_clear_lsb(netmem)->pp_magic;
|
||||
|
||||
return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
|
||||
}
|
||||
|
||||
static inline void netmem_set_dma_index(netmem_ref netmem,
|
||||
unsigned long id)
|
||||
{
|
||||
unsigned long magic;
|
||||
|
||||
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
|
||||
return;
|
||||
|
||||
magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
|
||||
__netmem_clear_lsb(netmem)->pp_magic = magic;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool,
|
||||
/* Driver calling page_pool_create() also call page_pool_destroy() */
|
||||
refcount_set(&pool->user_cnt, 1);
|
||||
|
||||
if (pool->dma_map)
|
||||
get_device(pool->p.dev);
|
||||
xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
|
||||
|
||||
if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
|
||||
netdev_assert_locked(pool->slow.netdev);
|
||||
@@ -320,9 +319,7 @@ static int page_pool_init(struct page_pool *pool,
|
||||
static void page_pool_uninit(struct page_pool *pool)
|
||||
{
|
||||
ptr_ring_cleanup(&pool->ring, NULL);
|
||||
|
||||
if (pool->dma_map)
|
||||
put_device(pool->p.dev);
|
||||
xa_destroy(&pool->dma_mapped);
|
||||
|
||||
#ifdef CONFIG_PAGE_POOL_STATS
|
||||
if (!pool->system)
|
||||
@@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
|
||||
netmem_ref netmem,
|
||||
u32 dma_sync_size)
|
||||
{
|
||||
if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
|
||||
__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
|
||||
if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
|
||||
rcu_read_lock();
|
||||
/* re-check under rcu_read_lock() to sync with page_pool_scrub() */
|
||||
if (pool->dma_sync)
|
||||
__page_pool_dma_sync_for_device(pool, netmem,
|
||||
dma_sync_size);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
|
||||
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
|
||||
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
|
||||
{
|
||||
dma_addr_t dma;
|
||||
int err;
|
||||
u32 id;
|
||||
|
||||
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
|
||||
* since dma_addr_t can be either 32 or 64 bits and does not always fit
|
||||
@@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
|
||||
if (dma_mapping_error(pool->p.dev, dma))
|
||||
return false;
|
||||
|
||||
if (page_pool_set_dma_addr_netmem(netmem, dma))
|
||||
if (page_pool_set_dma_addr_netmem(netmem, dma)) {
|
||||
WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
|
||||
goto unmap_failed;
|
||||
}
|
||||
|
||||
if (in_softirq())
|
||||
err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
|
||||
PP_DMA_INDEX_LIMIT, gfp);
|
||||
else
|
||||
err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
|
||||
PP_DMA_INDEX_LIMIT, gfp);
|
||||
if (err) {
|
||||
WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
|
||||
goto unset_failed;
|
||||
}
|
||||
|
||||
netmem_set_dma_index(netmem, id);
|
||||
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
|
||||
|
||||
return true;
|
||||
|
||||
unset_failed:
|
||||
page_pool_set_dma_addr_netmem(netmem, 0);
|
||||
unmap_failed:
|
||||
WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
|
||||
dma_unmap_page_attrs(pool->p.dev, dma,
|
||||
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
|
||||
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
|
||||
@@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
|
||||
if (unlikely(!page))
|
||||
return NULL;
|
||||
|
||||
if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
|
||||
if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
|
||||
put_page(page);
|
||||
return NULL;
|
||||
}
|
||||
@@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
|
||||
*/
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
netmem = pool->alloc.cache[i];
|
||||
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
|
||||
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
|
||||
put_page(netmem_to_page(netmem));
|
||||
continue;
|
||||
}
|
||||
@@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem)
|
||||
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
|
||||
netmem_ref netmem)
|
||||
{
|
||||
struct page *old, *page = netmem_to_page(netmem);
|
||||
unsigned long id;
|
||||
dma_addr_t dma;
|
||||
|
||||
if (!pool->dma_map)
|
||||
@@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
|
||||
*/
|
||||
return;
|
||||
|
||||
id = netmem_get_dma_index(netmem);
|
||||
if (!id)
|
||||
return;
|
||||
|
||||
if (in_softirq())
|
||||
old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
|
||||
else
|
||||
old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
|
||||
if (old != page)
|
||||
return;
|
||||
|
||||
dma = page_pool_get_dma_addr_netmem(netmem);
|
||||
|
||||
/* When page is unmapped, it cannot be returned to our pool */
|
||||
@@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
|
||||
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
|
||||
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
|
||||
page_pool_set_dma_addr_netmem(netmem, 0);
|
||||
netmem_set_dma_index(netmem, 0);
|
||||
}
|
||||
|
||||
/* Disconnects a page (from a page_pool). API users can have a need
|
||||
@@ -1080,8 +1114,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
|
||||
|
||||
static void page_pool_scrub(struct page_pool *pool)
|
||||
{
|
||||
unsigned long id;
|
||||
void *ptr;
|
||||
|
||||
page_pool_empty_alloc_cache_once(pool);
|
||||
pool->destroy_cnt++;
|
||||
if (!pool->destroy_cnt++ && pool->dma_map) {
|
||||
if (pool->dma_sync) {
|
||||
/* Disable page_pool_dma_sync_for_device() */
|
||||
pool->dma_sync = false;
|
||||
|
||||
/* Make sure all concurrent returns that may see the old
|
||||
* value of dma_sync (and thus perform a sync) have
|
||||
* finished before doing the unmapping below. Skip the
|
||||
* wait if the device doesn't actually need syncing, or
|
||||
* if there are no outstanding mapped pages.
|
||||
*/
|
||||
if (dma_dev_need_sync(pool->p.dev) &&
|
||||
!xa_empty(&pool->dma_mapped))
|
||||
synchronize_net();
|
||||
}
|
||||
|
||||
xa_for_each(&pool->dma_mapped, id, ptr)
|
||||
__page_pool_release_page_dma(pool, page_to_netmem(ptr));
|
||||
}
|
||||
|
||||
/* No more consumers should exist, but producers could still
|
||||
* be in-flight.
|
||||
|
||||
@@ -893,11 +893,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
|
||||
skb_get(list);
|
||||
}
|
||||
|
||||
static bool is_pp_netmem(netmem_ref netmem)
|
||||
{
|
||||
return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
|
||||
}
|
||||
|
||||
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
|
||||
unsigned int headroom)
|
||||
{
|
||||
@@ -995,14 +990,7 @@ bool napi_pp_put_page(netmem_ref netmem)
|
||||
{
|
||||
netmem = netmem_compound_head(netmem);
|
||||
|
||||
/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
|
||||
* in order to preserve any existing bits, such as bit 0 for the
|
||||
* head page of compound page and bit 1 for pfmemalloc page, so
|
||||
* mask those bits for freeing side when doing below checking,
|
||||
* and page_is_pfmemalloc() is checked in __page_pool_put_page()
|
||||
* to avoid recycling the pfmemalloc page.
|
||||
*/
|
||||
if (unlikely(!is_pp_netmem(netmem)))
|
||||
if (unlikely(!netmem_is_pp(netmem)))
|
||||
return false;
|
||||
|
||||
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
|
||||
@@ -1042,7 +1030,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
|
||||
|
||||
for (i = 0; i < shinfo->nr_frags; i++) {
|
||||
head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
|
||||
if (likely(is_pp_netmem(head_netmem)))
|
||||
if (likely(netmem_is_pp(head_netmem)))
|
||||
page_pool_ref_netmem(head_netmem);
|
||||
else
|
||||
page_ref_inc(netmem_to_page(head_netmem));
|
||||
|
||||
@@ -438,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
|
||||
netmem = netmem_compound_head(netmem);
|
||||
if (napi_direct && xdp_return_frame_no_direct())
|
||||
napi_direct = false;
|
||||
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
|
||||
* as mem->type knows this a page_pool page
|
||||
/* No need to check netmem_is_pp() as mem->type knows this a
|
||||
* page_pool page
|
||||
*/
|
||||
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
|
||||
napi_direct);
|
||||
|
||||
Reference in New Issue
Block a user